Compare commits

..

4 Commits

Author SHA1 Message Date
Lucas Wilkinson
275de34170 [BugFix] Fix false assertion with spec-decode=[2,4,..] and TP>2 (#29036)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
(cherry picked from commit 8f4f77a727)
2025-11-19 14:11:21 -08:00
Julien Denize
fa3ffb4365 [BugFix] Ray with multiple nodes (#28873)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
(cherry picked from commit cdeec2e606)
2025-11-19 14:11:08 -08:00
Lucas Wilkinson
6d5974369c [BugFix] Fix async-scheduling + FlashAttn MLA (#28990)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
(cherry picked from commit 48fc8b1e59)
2025-11-19 14:10:50 -08:00
Johnny
0ce9990d2c [NVIDIA] Guard SM100 CUTLASS MoE macro to SM100 builds v2 (#28938)
Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnynuca14@gmail.com>
(cherry picked from commit 49ef847aa8)
2025-11-19 14:10:37 -08:00
1518 changed files with 35347 additions and 90548 deletions

View File

@@ -1,24 +0,0 @@
name: vllm_ci
job_dirs:
- ".buildkite/test_areas"
- ".buildkite/image_build"
run_all_patterns:
- "docker/Dockerfile"
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/cuda.txt"
- "requirements/build.txt"
- "requirements/test.txt"
- "setup.py"
- "csrc/"
- "cmake/"
run_all_exclude_patterns:
- "docker/Dockerfile."
- "csrc/cpu/"
- "csrc/rocm/"
- "cmake/hipify.py"
- "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
repositories:
main: "vllm-ci-postmerge-repo"
premerge: "vllm-ci-test-repo"

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""
parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()
filename = os.path.basename(args.wheel)
with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}")
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
if "x86_64" in filename:
x86_wheel = filename
arm_wheel = filename.replace("x86_64", "aarch64").replace(
"manylinux1", "manylinux2014"
)
elif "aarch64" in filename:
x86_wheel = filename.replace("aarch64", "x86_64").replace(
"manylinux2014", "manylinux1"
)
arm_wheel = filename
else:
raise ValueError(f"Unsupported wheel: {filename}")
# cloudfront requires escaping the '+' character
f.write(
template.format(
x86_wheel=x86_wheel,
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
arm_wheel=arm_wheel,
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
)
)

View File

@@ -1,56 +0,0 @@
#!/bin/bash
set -e
if [[ $# -lt 8 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
merge_base_commit_build_args=""
fi
# build
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .

View File

@@ -1,57 +0,0 @@
group: Abuild
steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
commands:
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU image"
key: image-build-cpu
depends_on: []
commands:
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build HPU image"
soft_fail: true
depends_on: []
key: image-build-hpu
commands:
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []
optional: true
commands:
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -1,33 +0,0 @@
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -1,34 +0,0 @@
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu

View File

@@ -8,4 +8,3 @@ tasks:
value: 0.80 value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5 num_fewshot: 5
rtol: 0.05

View File

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@@ -9,40 +9,11 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1 --tp-size=1
""" """
import os
from contextlib import contextmanager
import lm_eval import lm_eval
import numpy as np import numpy as np
import yaml import yaml
DEFAULT_RTOL = 0.08 RTOL = 0.08
@contextmanager
def scoped_env_vars(new_env: dict[str, str]):
if not new_env:
# Fast path: nothing to do
yield
return
old_values = {}
new_keys = []
try:
for key, value in new_env.items():
if key in os.environ:
old_values[key] = os.environ[key]
else:
new_keys.append(key)
os.environ[key] = str(value)
yield
finally:
# Restore / clean up
for key, value in old_values.items():
os.environ[key] = value
for key in new_keys:
os.environ.pop(key, None)
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config, tp_size):
@@ -61,26 +32,23 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code}," f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}," f"max_model_len={max_model_len},"
) )
results = lm_eval.simple_evaluate(
env_vars = eval_config.get("env_vars", None) model=backend,
with scoped_env_vars(env_vars): model_args=model_args,
results = lm_eval.simple_evaluate( tasks=[task["name"] for task in eval_config["tasks"]],
model=backend, num_fewshot=eval_config["num_fewshot"],
model_args=model_args, limit=eval_config["limit"],
tasks=[task["name"] for task in eval_config["tasks"]], # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
num_fewshot=eval_config["num_fewshot"], # text models. however, this is regressing measured strict-match for
limit=eval_config["limit"], # existing text models in CI, so only apply it for mm, or explicitly set
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help apply_chat_template=eval_config.get(
# text models. however, this is regressing measured strict-match for "apply_chat_template", backend == "vllm-vlm"
# existing text models in CI, so only apply it for mm, or explicitly set ),
apply_chat_template=eval_config.get( fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
"apply_chat_template", backend == "vllm-vlm" # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
), gen_kwargs=eval_config.get("gen_kwargs"),
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), batch_size=batch_size,
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) )
gen_kwargs=eval_config.get("gen_kwargs"),
batch_size=batch_size,
)
return results return results
@@ -89,8 +57,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size) results = launch_lm_eval(eval_config, tp_size)
rtol = eval_config.get("rtol", DEFAULT_RTOL)
success = True success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
@@ -98,9 +64,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print( print(
f"{task['name']} | {metric['name']}: " f"{task['name']} | {metric['name']}: "
f"ground_truth={ground_truth:.3f} | " f"ground_truth={ground_truth} | measured={measured_value}"
f"measured={measured_value:.3f} | rtol={rtol}"
) )
success = success and np.isclose(ground_truth, measured_value, rtol=rtol) success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
assert success assert success

View File

@@ -108,65 +108,6 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Default Parameters Field
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
<details>
<summary> An Example of default parameters field </summary>
```json
{
"defaults": {
"qps_list": [
"inf"
],
"server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
},
"server_parameters": {
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"block_size": 128,
"disable_log_stats": "",
"load_format": "dummy"
},
"client_parameters": {
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 200,
"ignore-eos": ""
}
},
"tests": [
{
"test_name": "serving_llama3B_tp2_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 2,
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
}
},
{
"test_name": "serving_qwen3_tp4_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-14B",
"tensor_parallel_size": 4,
},
"client_parameters": {
"model": "Qwen/Qwen3-14B",
}
},
]
}
```
</details>
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.

View File

@@ -110,8 +110,7 @@ json2envs() {
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
local timeout_val="1200" timeout 1200 bash -c '
timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do until curl -X POST localhost:8000/v1/completions; do
sleep 1 sleep 1
done' && return 0 || return 1 done' && return 0 || return 1
@@ -317,44 +316,12 @@ run_throughput_tests() {
run_serving_tests() { run_serving_tests() {
# run serving tests using `vllm bench serve` command # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file local serving_test_file
serving_test_file=$1 serving_test_file=$1
# Iterate over serving tests # Iterate over serving tests
jq -c ' jq -c '.[]' "$serving_test_file" | while read -r params; do
if type == "array" then
# Plain format: test cases array
.[]
elif (type == "object" and has("tests")) then
# merge the default parameters into each test cases
. as $root
| ($root.defaults // {}) as $d
| ($root.tests // [])[]
# default qps / max_concurrency from defaults if missing
| .qps_list = (.qps_list // $d.qps_list)
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
# merge envs / params: test overrides defaults
| .server_environment_variables =
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
| .server_parameters =
(($d.server_parameters // {}) + (.server_parameters // {}))
| .client_parameters =
(($d.client_parameters // {}) + (.client_parameters // {}))
else
error("Unsupported serving test file format: must be array or object with .tests")
end
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -368,25 +335,20 @@ run_serving_tests() {
continue continue
fi fi
# get client and server arguments (after merged the default parameters) # get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params") server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list" echo "Running over qps list $qps_list"
# max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts') num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]" max_concurrency_list="[$num_prompts]"
fi fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list" echo "Running over max concurrency list $max_concurrency_list"

View File

@@ -0,0 +1,610 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

File diff suppressed because it is too large Load Diff

View File

@@ -1,246 +1,276 @@
{ [
"defaults": { {
"qps_list": [ "test_name": "serving_llama8B_tp1_sharegpt",
"inf" "qps_list": [1, 4, 16, "inf"],
], "max_concurrency_list": [32],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": {
"server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000,
"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40
"VLLM_CPU_KVCACHE_SPACE": 40 },
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
"server_parameters": { {
"model": "meta-llama/Llama-3.1-8B-Instruct", "test_name": "serving_llama8B_tp2_sharegpt",
"tensor_parallel_size": 1, "qps_list": [1, 4, 16, "inf"],
"dtype": "bfloat16", "max_concurrency_list": [32],
"distributed_executor_backend": "mp", "server_environment_variables": {
"block_size": 128, "VLLM_RPC_TIMEOUT": 100000,
"trust_remote_code": "", "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"disable_log_stats": "", "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"enforce_eager": "", "VLLM_CPU_SGL_KERNEL": 1,
"max_num_batched_tokens": 2048, "VLLM_CPU_KVCACHE_SPACE": 40
"max_num_seqs": 256, },
"load_format": "dummy" "server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
"client_parameters": { {
"model": "meta-llama/Llama-3.1-8B-Instruct", "test_name": "serving_llama8B_tp1_random_128_128",
"backend": "vllm", "qps_list": [1, 4, 16, "inf"],
"ignore-eos": "", "max_concurrency_list": [32],
"num_prompts": 200 "server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
} }
}, ]
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -8,28 +8,13 @@ steps:
commands: commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build arm64 wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build # aarch64 build
- label: "Build arm64 CPU wheel" - label: "Build arm64 CPU wheel"
depends_on: ~ depends_on: ~
@@ -40,11 +25,24 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# x86 + CUDA builds # x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.9" - label: "Build wheel - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-wheel-cuda-12-9 id: build-wheel-cuda-12-9
@@ -54,7 +52,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31" - "bash .buildkite/scripts/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@@ -67,21 +65,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 CPU wheel build
- label: "Build x86 CPU wheel"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@@ -125,6 +109,7 @@ steps:
- label: "Annotate release workflow" - label: "Annotate release workflow"
depends_on: depends_on:
- create-multi-arch-manifest - create-multi-arch-manifest
- build-wheel-cuda-12-8
id: annotate-release-workflow id: annotate-release-workflow
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge

View File

@@ -23,8 +23,8 @@ To download the wheel (by version):
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\` \`\`\`
To download and upload the image: To download and upload the image:
@@ -45,9 +45,8 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest rm vllm/vllm-openai:latest docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\` \`\`\`

View File

@@ -1,400 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# do not complain about line length (for docstring)
# ruff: noqa: E501
import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote
import regex as re
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""
@dataclass
class WheelFileInfo:
package_name: str
version: str
build_tag: str | None
python_tag: str
abi_tag: str
platform_tag: str
variant: str | None
filename: str
def parse_from_filename(file: str) -> WheelFileInfo:
"""
Parse wheel file name to extract metadata.
The format of wheel names:
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
Example:
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
"""
wheel_file_re = re.compile(
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
)
match = wheel_file_re.match(file)
if not match:
raise ValueError(f"Invalid wheel file name: {file}")
package_name = match.group("package_name")
version = match.group("version")
build_tag = match.group("build_tag")
python_tag = match.group("python_tag")
abi_tag = match.group("abi_tag")
platform_tag = match.group("platform_tag")
# extract variant from version
variant = None
if "dev" in version:
ver_after_dev = version.split("dev")[-1]
if "." in ver_after_dev:
variant = ver_after_dev.split(".")[-1]
version = version.removesuffix("." + variant)
else:
if "+" in version:
version, variant = version.split("+")
return WheelFileInfo(
package_name=package_name,
version=version,
build_tag=build_tag,
python_tag=python_tag,
abi_tag=abi_tag,
platform_tag=platform_tag,
variant=variant,
filename=file,
)
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
"""
href_tags = []
for name in sorted(subdir_names):
name = name.strip("/").strip(".")
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
def generate_package_index_and_metadata(
wheel_files: list[WheelFileInfo],
wheel_base_dir: Path,
index_base_dir: Path,
comment: str = "",
) -> tuple[str, str]:
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
"""
href_tags = []
metadata = []
for file in sorted(wheel_files, key=lambda x: x.filename):
relative_path = (
wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
)
# handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
# NOTE: this is AWS S3 specific behavior!
file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
file_meta = asdict(file)
file_meta["path"] = file_path_quoted
metadata.append(file_meta)
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str
def generate_index_and_metadata(
whl_files: list[str],
wheel_base_dir: Path,
index_base_dir: Path,
default_variant: str | None = None,
alias_to_default: str | None = None,
comment: str = "",
):
"""
Generate index for all wheel files.
Args:
whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
wheel_base_dir (Path): Base directory for wheel files.
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly.
Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant sub-directory
index.html
vllm/
index.html
metadata.json
cu129/ # cu129 is actually the alias to default variant
index.html
vllm/
index.html
metadata.json
cu130/ # cu130 variant sub-directory
index.html
vllm/
index.html
metadata.json
...
metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
[
{
"package_name": "vllm",
"version": "0.10.2rc2",
"build_tag": null,
"python_tag": "cp38",
"abi_tag": "abi3",
"platform_tag": "manylinux2014_aarch64",
"variant": "cu129",
"filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
"path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
},
...
]
"""
parsed_files = [parse_from_filename(f) for f in whl_files]
if not parsed_files:
print("No wheel files found, skipping index generation.")
return
# Group by variant
variant_to_files: dict[str, list[WheelFileInfo]] = {}
for file in parsed_files:
variant = file.variant or "default"
if variant not in variant_to_files:
variant_to_files[variant] = []
variant_to_files[variant].append(file)
print(f"Found variants: {list(variant_to_files.keys())}")
# sanity check for default variant
if default_variant:
if "default" in variant_to_files:
raise ValueError(
"All wheel files must have variant suffixes when `default_variant` is specified."
)
if default_variant not in variant_to_files:
raise ValueError(
f"Default variant '{default_variant}' not found among wheel files."
)
if alias_to_default:
if "default" not in variant_to_files:
# e.g. only some wheels are uploaded to S3 currently
print(
"[WARN] Alias to default variant specified, but no default variant found."
)
elif alias_to_default in variant_to_files:
raise ValueError(
f"Alias variant name '{alias_to_default}' already exists among wheel files."
)
else:
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.")
# Generate comment in HTML header
comment_str = f" ({comment})" if comment else ""
comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
# Generate index for each variant
subdir_names = set()
for variant, files in variant_to_files.items():
if variant == "default":
variant_dir = index_base_dir
else:
variant_dir = index_base_dir / variant
subdir_names.add(variant)
variant_dir.mkdir(parents=True, exist_ok=True)
# gather all package names in this variant
packages = set(f.package_name for f in files)
if variant == "default":
# these packages should also appear in the "project list"
# generate after all variants are processed
subdir_names = subdir_names.union(packages)
else:
# generate project list for this variant directly
project_list_str = generate_project_list(sorted(packages), comment_tmpl)
with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str)
for package in packages:
# filter files belonging to this package only
package_files = [f for f in files if f.package_name == package]
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
package_files, wheel_base_dir, package_dir, comment
)
with open(package_dir / "index.html", "w") as f:
f.write(index_str)
with open(package_dir / "metadata.json", "w") as f:
f.write(metadata_str)
# Generate top-level project list index
project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str)
if __name__ == "__main__":
"""
Arguments:
--version <version> : version string for the current build (e.g., commit hash)
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
--comment <comment_string> : (optional) comment string to include in generated HTML files
"""
parser = argparse.ArgumentParser(
description="Process nightly build wheel files to generate indices."
)
parser.add_argument(
"--version",
type=str,
required=True,
help="Version string for the current build (e.g., commit hash)",
)
parser.add_argument(
"--current-objects",
type=str,
required=True,
help="Path to JSON file containing current S3 objects listing in this version directory",
)
parser.add_argument(
"--output-dir",
type=str,
required=True,
help="Directory to store generated index files",
)
parser.add_argument(
"--alias-to-default",
type=str,
default=None,
help="Alias variant name for the default variant",
)
parser.add_argument(
"--comment",
type=str,
default="",
help="Optional comment string to include in generated HTML files",
)
args = parser.parse_args()
version = args.version
if "/" in version or "\\" in version:
raise ValueError("Version string must not contain slashes.")
current_objects_path = Path(args.current_objects)
output_dir = Path(args.output_dir)
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)
# Read current objects JSON
with open(current_objects_path) as f:
current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
# current_objects looks like from list_objects_v2 S3 API:
"""
"Contents": [
{
"Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
"LastModified": "2025-11-28T14:00:32+00:00",
"ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
"ChecksumAlgorithm": [
"CRC64NVME"
],
"ChecksumType": "FULL_OBJECT",
"Size": 435649349,
"StorageClass": "STANDARD"
},
...
]
"""
# Extract wheel file keys
wheel_files = []
for item in current_objects.get("Contents", []):
key: str = item["Key"]
if key.endswith(".whl"):
wheel_files.append(key.split("/")[-1]) # only the filename is used
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# keep only "official" files for a non-nightly version (specifed by cli args)
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
if PY_VERSION_RE.match(version):
# upload-wheels.sh ensures no "dev" is in args.version
wheel_files = list(
filter(lambda x: version in x and "dev" not in x, wheel_files)
)
print(f"Non-nightly version detected, wheel files used: {wheel_files}")
else:
print("Nightly version detected, keeping all wheel files.")
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
wheel_base_dir = Path(output_dir).parent / version
index_base_dir = Path(output_dir)
generate_index_and_metadata(
whl_files=wheel_files,
wheel_base_dir=wheel_base_dir,
index_base_dir=index_base_dir,
default_variant=None,
alias_to_default=args.alias_to_default,
comment=args.comment.strip(),
)
print(f"Successfully generated index and metadata in {output_dir}")

View File

@@ -1,68 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() {
set -e
docker exec cpu-test bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
# Run kernel tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
docker exec cpu-test bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2h bash -c cpu_tests

View File

@@ -25,22 +25,20 @@ function cpu_tests() {
# offline inference # offline inference
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -evx set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator tblib pip install sentence-transformers datamodel_code_generator
# Note: disable Bart until supports V1 # Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log

View File

@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -73,11 +73,12 @@ function cpu_tests() {
pytest -x -s -v \ pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run AWQ/GPTQ test # Note: disable it until supports V1
docker exec cpu-test-"$NUMA_NODE" bash -c " # Run AWQ test
set -e # docker exec cpu-test-"$NUMA_NODE" bash -c "
pytest -x -s -v \ # set -e
tests/quantization/test_cpu_wna16.py" # pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests # Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "

View File

@@ -74,7 +74,6 @@ FROM ${BASE_IMAGE_NAME}
# Define environments # Define environments
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \

View File

@@ -35,10 +35,9 @@ docker run \
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests cd tests
pytest -v -s v1/core pytest -v -s v1/core
@@ -47,6 +46,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
' '

View File

@@ -12,11 +12,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl" PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..." echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory # Clean up any existing Prime-RL directory

View File

@@ -17,17 +17,7 @@ wait_for_server() {
} }
MODEL="deepseek-ai/DeepSeek-V2-lite" MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() { cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then

View File

@@ -2,9 +2,9 @@
set -euxo pipefail set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25} THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319} NUM_Q=${2:-1319}
PORT=${3:-8040} PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}" mkdir -p "${OUT_DIR}"
@@ -16,18 +16,8 @@ wait_for_server() {
done' done'
} }
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() { cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -46,14 +36,11 @@ for BACK in "${BACKENDS[@]}"; do
VLLM_ALL2ALL_BACKEND=$BACK \ VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 4 \ --tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server $PORT wait_for_server $PORT

View File

@@ -1,74 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="QWen/Qwen3-30B-A3B-FP8"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--enable-eplb \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@@ -2,28 +2,6 @@
set -ex set -ex
# ======== part 0: setup ========
BUCKET="vllm-wheels"
INDICES_OUTPUT_DIR="indices"
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
# detect if python3.10+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
if [[ "$has_new_python" -eq 0 ]]; then
# use new python from docker
docker pull python:3-slim
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ========= part 1: collect, rename & upload the wheel ==========
# Assume wheels are in artifacts/dist/*.whl # Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl) wheel_files=(artifacts/dist/*.whl)
@@ -32,76 +10,74 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1 exit 1
fi fi
# Get the single wheel file
wheel="${wheel_files[0]}" wheel="${wheel_files[0]}"
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31 # Detect architecture and rename 'linux' to appropriate manylinux version
# we also accept params as manylinux tag arch=$(uname -m)
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels if [[ $arch == "x86_64" ]]; then
manylinux_version="${1:-manylinux_2_31}" manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# Rename 'linux' to the appropriate manylinux version in the wheel filename # Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
echo "Error: Wheel filename does not contain 'linux': $wheel"
exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}" new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel" mv -- "$wheel" "$new_wheel"
wheel="$new_wheel" wheel="$new_wheel"
echo "Renamed wheel to: $wheel"
# Extract the version from the wheel # Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $version" echo "Version: $version"
pure_version="${version%%+*}"
echo "Pure version (without variant): $pure_version"
# copy wheel to its own bucket normal_wheel="$wheel" # Save the original wheel filename
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
# ========= part 2: generate and upload indices ========== # If the version contains "dev", rename it to v1.0.0.dev for consistency
# generate indices for all existing wheels in the commit directory if [[ $version == *dev* ]]; then
# this script might be run multiple times if there are multiple variants being built suffix="${version##*.}"
# so we need to guarantee there is little chance for "TOCTOU" issues if [[ $suffix == cu* ]]; then
# i.e., one process is generating indices while another is uploading a new wheel new_version="1.0.0.dev+${suffix}"
# so we need to ensure no time-consuming operations happen below else
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
# use cp to keep both files in the artifacts directory
cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# list all wheels in the commit directory # Upload the wheel to S3
echo "Existing wheels on S3:" python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 ls "$S3_COMMIT_PREFIX"
obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indicies for all existing wheels # generate index for this commit
# this indices have relative paths that could work as long as it is next to the wheel directory in s3 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
# i.e., the wheels are always in s3://vllm-wheels/<commit>/ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then if [[ $normal_wheel == *"cu129"* ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" # only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else else
alias_arg="" echo "Skipping index files for non-cu129 wheels"
fi fi
# HACK: we do not need regex module here, but it is required by pre-commit hook # generate index for nightly
# To avoid any external dependency, we simply replace it back to the stdlib re module aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally if [[ $normal_wheel == *"cu129"* ]]; then
echo "Uploading indices to $S3_COMMIT_PREFIX" # only upload index.html for cu129 wheels (default wheels) as it
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
# copy to /nightly/ only if it is on the main branch and not a PR else
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then echo "Skipping index files for non-cu129 wheels"
echo "Uploading indices to overwrite /nightly/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
if [[ "$version" != *"dev"* ]]; then aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some # if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist # of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- requirements/nightly_torch_test.txt - requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min - label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/multimodal - tests/multimodal
@@ -61,31 +61,25 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
timeout_in_minutes: 30 timeout_in_minutes: 10
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils - tests/transformers_utils
- tests/config
no_gpu: true no_gpu: true
commands: commands:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config
- label: Python-only Installation Test # 10min - label: Python-only Installation Test # 10min
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -117,9 +111,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min - label: Entrypoints Unit Tests # 5min
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
timeout_in_minutes: 10 timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
@@ -193,7 +187,7 @@ steps:
- tests/distributed/test_utils - tests/distributed/test_utils
- tests/distributed/test_pynccl - tests/distributed/test_pynccl
- tests/distributed/test_events - tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_basic_correctness
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
@@ -216,13 +210,12 @@ steps:
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -255,9 +248,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min - label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
timeout_in_minutes: 15 timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
@@ -313,25 +306,28 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 9min - label: Engine Test # 25min
timeout_in_minutes: 15 timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
- tests/tokenization
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port - tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine # 65min - label: V1 Test e2e + engine # 30min
timeout_in_minutes: 90 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -344,9 +340,9 @@ steps:
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
@@ -394,34 +390,10 @@ steps:
commands: commands:
- pytest -v -s v1/attention - pytest -v -s v1/attention
- label: Batch Invariance Tests (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
@@ -437,34 +409,29 @@ steps:
- label: Examples Test # 30min - label: Examples Test # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- vllm/multimodal
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -518,7 +485,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min - label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@@ -526,16 +493,21 @@ steps:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
# Run unit tests defined directly under compile/, - pytest -v -s compile/test_pass_manager.py
# not including subdirectories, which are usually heavier - pytest -v -s compile/test_fusion.py
# tests covered elsewhere. - pytest -v -s compile/test_fusion_attn.py
# Use `find` to launch multiple instances of pytest so that - pytest -v -s compile/test_functionalization.py
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - pytest -v -s compile/test_silu_mul_quant_fusion.py
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" # - pytest -v -s compile/test_sequence_parallelism.py
# - pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@@ -543,11 +515,9 @@ steps:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
# Run smoke tests under fullgraph directory, except test_full_graph.py - pytest -v -s compile/test_basic_correctness.py
# as it is a heavy test that is covered in other steps. - pytest -v -s compile/test_multimodal_compile.py
# Use `find` to launch multiple instances of pytest so that - pytest -v -s compile/piecewise/
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Test # 27min - label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -559,10 +529,10 @@ steps:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time # Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a - # Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
- label: Cudagraph test - label: Cudagraph test
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -591,7 +561,7 @@ steps:
- label: Kernels Attention Test %N # 23min - label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_8 agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@@ -618,7 +588,7 @@ steps:
- label: Kernels MoE Test %N # 40min - label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_8 agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@@ -645,26 +615,6 @@ steps:
commands: commands:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
# Not replicating for CUTLAS & CuTe
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@@ -723,18 +673,16 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 - uv pip install --system torchao==0.13.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 15min
timeout_in_minutes: 75 timeout_in_minutes: 20
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -747,9 +695,9 @@ steps:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py - vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check commands: # LMEval
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
- label: OpenAI-Compatible Tool Use # 23 min - label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35 timeout_in_minutes: 35
@@ -761,7 +709,19 @@ steps:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s -m 'not cpu_test' tool_use
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
##### models test ##### ##### models test #####
@@ -786,7 +746,6 @@ steps:
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py - tests/models/test_initialization.py
commands: commands:
# Only when vLLM model source is modified - test initialization of a large # Only when vLLM model source is modified - test initialization of a large
@@ -932,18 +891,6 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test # 44min - label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@@ -971,8 +918,8 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min - label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 180 timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
@@ -984,8 +931,7 @@ steps:
commands: commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 # 60min - label: Multi-Modal Models Test (Extended) 1
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
@@ -1009,8 +955,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3 # 75min - label: Multi-Modal Models Test (Extended) 3
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
@@ -1053,12 +998,12 @@ steps:
optional: true optional: true
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -1102,9 +1047,8 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min - label: Blackwell Fusion Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
@@ -1112,29 +1056,20 @@ steps:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/ - vllm/compilation/
# can affect pattern matching # can affect pattern matching
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set # this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min - label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -1151,16 +1086,20 @@ steps:
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py - tests/compile/test_fusions_e2e.py
- tests/compile/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py - pytest -v -s tests/compile/test_fusions_e2e.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell GPT-OSS Eval - label: ROCm GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 agent_pool: mi325_1
mirror_hardwares: [amdproduction]
optional: true # run on nightlies optional: true # run on nightlies
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
@@ -1169,7 +1108,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
commands: commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5' - uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test - label: Blackwell Quantized MoE Test
timeout_in_minutes: 60 timeout_in_minutes: 60
@@ -1259,7 +1198,7 @@ steps:
- vllm/worker/worker_base.py - vllm/worker/worker_base.py
- vllm/v1/engine/ - vllm/v1/engine/
- vllm/v1/worker/ - vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_basic_correctness.py
- tests/compile/test_wrapper.py - tests/compile/test_wrapper.py
- tests/distributed/ - tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py - tests/entrypoints/llm/test_collective_rpc.py
@@ -1269,11 +1208,10 @@ steps:
- tests/v1/worker/test_worker_memory_snapshot.py - tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1305,7 +1243,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_2 agent_pool: mi325_2
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@@ -1373,27 +1311,11 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py
# Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_2 agent_pool: mi325_2
# grade: Blocking # grade: Blocking
@@ -1404,7 +1326,21 @@ steps:
- vllm/ - vllm/
- tests/weight_loading - tests/weight_loading
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min - label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@@ -1440,13 +1376,12 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
gpu: a100 mirror_hardwares: [amdexperimental, amdproduction]
optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
gpu: a100
optional: true
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
@@ -1458,11 +1393,11 @@ steps:
##### H100 test ##### ##### H100 test #####
- label: LM Eval Large Models (H100) # optional - label: LM Eval Large Models (H100) # optional
gpu: h100 mirror_hardwares: [amdexperimental, amdproduction]
optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
gpu: h100
optional: true
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
@@ -1472,7 +1407,6 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test ##### ##### H200 test #####
- label: Distributed Tests (H200) # optional - label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@@ -1483,14 +1417,12 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
@@ -1504,57 +1436,6 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### E2E Eval Tests #####
- label: LM Eval Small Models (1 Card) # 15min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: ROCm LM Eval Large Models (8 Card)
mirror_hardwares: [amdproduction]
agent_pool: mi325_8
num_gpus: 8
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
agent_pool: mi325_1
mirror_hardwares: [amdexperimental, amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
##### RL Integration Tests ##### ##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min - label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@@ -1569,8 +1450,9 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 60 timeout_in_minutes: 60
@@ -1581,8 +1463,8 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - label: Qwen3-30B-A3B-FP8-block Accuracy
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 60 timeout_in_minutes: 60
@@ -1591,36 +1473,4 @@ steps:
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@@ -57,16 +57,14 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
timeout_in_minutes: 30 timeout_in_minutes: 10
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
no_gpu: true no_gpu: true
@@ -75,8 +73,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
@@ -171,7 +167,7 @@ steps:
- tests/distributed/test_utils - tests/distributed/test_utils
- tests/distributed/test_pynccl - tests/distributed/test_pynccl
- tests/distributed/test_events - tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_basic_correctness
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
@@ -196,13 +192,12 @@ steps:
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -280,18 +275,21 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 9min - label: Engine Test # 25min
timeout_in_minutes: 15 timeout_in_minutes: 40
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
- tests/tokenization
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port - tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min - label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
@@ -348,19 +346,6 @@ steps:
commands: commands:
- pytest -v -s v1/attention - pytest -v -s v1/attention
- label: Batch Invariance Tests (H100) # 10min
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min - label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
@@ -390,28 +375,23 @@ steps:
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- vllm/multimodal
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -465,14 +445,18 @@ steps:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
# Run unit tests defined directly under compile/, - pytest -v -s compile/test_graph_partition.py
# not including subdirectories, which are usually heavier - pytest -v -s compile/test_config.py
# tests covered elsewhere. - pytest -v -s compile/test_pass_manager.py
# Use `find` to launch multiple instances of pytest so that - pytest -v -s compile/test_fusion.py
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - pytest -v -s compile/test_fusion_attn.py
# However, find does not normally propagate error codes, so we combine it with xargs - pytest -v -s compile/test_functionalization.py
# (using -0 for proper path handling) - pytest -v -s compile/test_silu_mul_quant_fusion.py
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- pytest -v -s compile/test_qk_norm_rope_fusion.py
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
@@ -482,13 +466,9 @@ steps:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
# Run smoke tests under fullgraph directory, except test_full_graph.py - pytest -v -s compile/test_basic_correctness.py
# as it is a heavy test that is covered in other steps. - pytest -v -s compile/test_multimodal_compile.py
# Use `find` to launch multiple instances of pytest so that - pytest -v -s compile/piecewise/
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min - label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -499,10 +479,10 @@ steps:
- tests/compile - tests/compile
commands: commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead # fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time # Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a - # Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test - label: Cudagraph test
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -574,25 +554,6 @@ steps:
commands: commands:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@@ -643,7 +604,6 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
@@ -674,7 +634,16 @@ steps:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s -m 'not cpu_test' tool_use
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
##### models test ##### ##### models test #####
@@ -685,7 +654,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/test_initialization.py - tests/models/test_initialization.py
- tests/models/registry.py
commands: commands:
# Run a subset of model initialization tests # Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -696,9 +664,7 @@ steps:
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py - tests/models/test_initialization.py
- tests/models/registry.py
commands: commands:
# Only when vLLM model source is modified - test initialization of a large # Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above # subset of supported models (the complement of the small subset in the above
@@ -824,24 +790,14 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test (CPU) - label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test
timeout_in_minutes: 60 timeout_in_minutes: 60
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py - pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard) # 60min - label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80 timeout_in_minutes: 80
@@ -918,15 +874,14 @@ steps:
- label: Transformers Nightly Models Test - label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
optional: true optional: true
soft_fail: true
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/ # - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -970,7 +925,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min - label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -980,29 +934,22 @@ steps:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/ - vllm/compilation/
# can affect pattern matching # can affect pattern matching
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set # this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min - label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -1019,11 +966,12 @@ steps:
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py - tests/compile/test_fusions_e2e.py
- tests/compile/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py - pytest -v -s tests/compile/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval - label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
@@ -1121,7 +1069,7 @@ steps:
- vllm/worker/worker_base.py - vllm/worker/worker_base.py
- vllm/v1/engine/ - vllm/v1/engine/
- vllm/v1/worker/ - vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_basic_correctness.py
- tests/compile/test_wrapper.py - tests/compile/test_wrapper.py
- tests/distributed/ - tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py - tests/entrypoints/llm/test_collective_rpc.py
@@ -1133,11 +1081,10 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1317,11 +1264,11 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_all_reduce.py
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1341,7 +1288,6 @@ steps:
- label: Prime-RL Integration Test # 15min - label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
optional: true optional: true
soft_fail: true
num_gpus: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies: source_file_dependencies:
@@ -1359,20 +1305,11 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
gpu: h100 gpu: h100
optional: true optional: true
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

View File

@@ -1,21 +0,0 @@
group: Attention
depends_on:
- image-build
steps:
- label: V1 attention (H100)
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 attention (B200)
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this

View File

@@ -1,16 +0,0 @@
group: Basic Correctness
depends_on:
- image-build
steps:
- label: Basic Correctness
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py

View File

@@ -1,19 +0,0 @@
group: Benchmarks
depends_on:
- image-build
steps:
- label: Benchmarks
timeout_in_minutes: 20
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
- benchmarks/
commands:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/

View File

@@ -1,57 +0,0 @@
group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py

View File

@@ -1,22 +0,0 @@
group: CUDA
depends_on:
- image-build
steps:
- label: Platform Tests (CUDA)
timeout_in_minutes: 15
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Cudagraph
timeout_in_minutes: 20
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py

View File

@@ -1,199 +0,0 @@
group: Distributed
depends_on:
- image-build
steps:
- label: Distributed Comm Ops
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- pytest -v -s distributed/test_shm_buffer.py
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs)
timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/distributed
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
- vllm/config/parallel.py
- vllm/distributed/
- vllm/v1/engine/llm_engine.py
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H200)
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py

View File

@@ -1,59 +0,0 @@
group: E2E Integration
depends_on:
- image-build
steps:
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@@ -1,26 +0,0 @@
group: Engine
depends_on:
- image-build
steps:
- label: Engine
timeout_in_minutes: 15
source_file_dependencies:
- vllm/
- tests/engine
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine

View File

@@ -1,68 +0,0 @@
group: Entrypoints
depends_on:
- image-build
steps:
- label: Entrypoints Unit Tests
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/entrypoints
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (API Server)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/pooling
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
- label: Entrypoints V1
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints
- label: OpenAI API Correctness
timeout_in_minutes: 30
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/

View File

@@ -1,23 +0,0 @@
group: Expert Parallelism
depends_on:
- image-build
steps:
- label: EPLB Algorithm
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_algo.py
commands:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py

View File

@@ -1,117 +0,0 @@
group: Kernels
depends_on:
- image-build
steps:
- label: Kernels Core Operation Test
timeout_in_minutes: 75
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Quantization Test %N
timeout_in_minutes: 90
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test %N
timeout_in_minutes: 60
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Mamba Test
timeout_in_minutes: 45
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

View File

@@ -1,46 +0,0 @@
group: LM Eval
depends_on:
- image-build
steps:
- label: LM Eval Small Models
timeout_in_minutes: 75
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100)
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: LM Eval Small Models (B200)
timeout_in_minutes: 120
gpu: b200
optional: true
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

View File

@@ -1,31 +0,0 @@
group: LoRA
depends_on:
- image-build
steps:
- label: LoRA %N
timeout_in_minutes: 30
source_file_dependencies:
- vllm/lora
- tests/lora
commands:
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
parallelism: 4
- label: LoRA TP (Distributed)
timeout_in_minutes: 30
num_gpus: 4
source_file_dependencies:
- vllm/lora
- tests/lora
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py

View File

@@ -1,165 +0,0 @@
group: Miscellaneous
depends_on:
- image-build
steps:
- label: V1 Others
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/v1
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
- pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Others (CPU)
depends_on: ~
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics
- label: Regression
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_regression
commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
- vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/v1/tracing
commands:
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s v1/tracing
- label: Python-only Installation
depends_on: ~
timeout_in_minutes: 20
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- label: Async Engine, Inputs, Utils, Worker
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: ~
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

View File

@@ -1,17 +0,0 @@
group: Model Executor
depends_on:
- image-build
steps:
- label: Model Executor
timeout_in_minutes: 35
source_file_dependencies:
- vllm/engine/arg_utils.py
- vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

View File

@@ -1,62 +0,0 @@
group: Models - Basic
depends_on:
- image-build
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
# test.) Also run if model initialization test file is modified
- pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
- label: Basic Models Tests (Other)
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
no_gpu: true
commands:
- pytest -v -s models/test_utils.py models/test_vision.py
- label: Transformers Nightly Models
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@@ -1,22 +0,0 @@
group: Models - Distributed
depends_on:
- image-build
steps:
- label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/
- tests/basic_correctness/
- tests/model_executor/model_loader/test_sharded_state_loader.py
- tests/models/
commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

View File

@@ -1,91 +0,0 @@
group: Models - Language
depends_on:
- image-build
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
# Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and (not slow_test)'
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/language/pooling/test_embedding.py
- tests/models/language/generation/test_common.py
- tests/models/language/pooling/test_classification.py
commands:
# Shard slow subset of standard language models tests. Only run when model
# source is modified, or when specified test files are modified
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/generation_ppl_test
commands:
- pytest -v -s models/language/generation_ppl_test
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling_mteb_test
commands:
- pytest -v -s models/language/pooling_mteb_test

View File

@@ -1,79 +0,0 @@
group: Models - Multimodal
depends_on:
- image-build
steps:
- label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 80
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor # 44min
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models (Extended) 1
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models (Extended) 2
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

View File

@@ -1,34 +0,0 @@
group: Plugins
depends_on:
- image-build
steps:
- label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

View File

@@ -1,50 +0,0 @@
group: PyTorch
depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
commands:
# Run unit tests defined directly under compile/,
# not including subdirectories, which are usually heavier
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
commands:
# Run smoke tests under fullgraph directory, except test_full_graph.py
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph
timeout_in_minutes: 40
source_file_dependencies:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh

View File

@@ -1,46 +0,0 @@
group: Quantization
depends_on:
- image-build
steps:
- label: Quantization
timeout_in_minutes: 90
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release, and pin a working version of torchao nightly here
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Quantized Models Test
timeout_in_minutes: 60
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization

View File

@@ -1,14 +0,0 @@
group: Samplers
depends_on:
- image-build
steps:
- label: Samplers Test
timeout_in_minutes: 75
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
- tests/samplers
- tests/conftest.py
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

View File

@@ -1,13 +0,0 @@
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use

View File

@@ -1,25 +0,0 @@
group: Weight Loading
depends_on:
- image-build
steps:
- label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

8
.github/CODEOWNERS vendored
View File

@@ -9,7 +9,6 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee /vllm/lora @jeejeelee
@@ -36,9 +35,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC /vllm/v1/offloading @ApostaC
# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin /.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_multi_node_assignment.py @youkaichao
@@ -60,7 +56,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC /tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC /tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256
# Transformers modeling backend # Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor /vllm/model_executor/models/transformers @hmellor
@@ -146,10 +141,9 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche /requirements/kv_connectors.txt @NickLucche
# Pooling models # Pooling models
/examples/pooling @noooop /examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop /tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop /tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop /vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop /vllm/model_executor/layers/pooler.py @noooop

48
.github/mergify.yml vendored
View File

@@ -14,52 +14,6 @@ pull_request_rules:
comment: comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/" message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: comment-pre-commit-failure
description: Comment on PR when pre-commit check fails
conditions:
- status-failure=pre-commit
- -closed
- -draft
actions:
comment:
message: |
Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash
uv pip install pre-commit
pre-commit install
pre-commit run --all-files
```
Then, commit the changes and push to your branch.
For future commits, `pre-commit` will run automatically on changed files before each commit.
> [!TIP]
> <details>
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <br/>
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
>
> ```bash
> # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ```
> </details>
- name: comment-dco-failure
description: Comment on PR when DCO check fails
conditions:
- status-failure=dco
- -closed
- -draft
actions:
comment:
message: |
Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
- name: label-ci-build - name: label-ci-build
description: Automatically apply ci/build label description: Automatically apply ci/build label
conditions: conditions:
@@ -186,7 +140,7 @@ pull_request_rules:
- files~=^tests/entrypoints/test_context.py - files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- files~=^vllm/entrypoints/openai/parser/harmony_utils.py - files~=^vllm/entrypoints/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py - files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py - files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py - files~=^vllm/entrypoints/context.py

View File

@@ -13,10 +13,10 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python - name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with: with:
python-version: '3.12' python-version: '3.12'

View File

@@ -105,31 +105,6 @@ jobs:
} }
], ],
}, },
cpu: {
// Keyword search - matches whole words only (with word boundaries)
keywords: [
{
term: "CPU Backend",
searchIn: "title"
},
{
term: "x86",
searchIn: "title"
},
{
term: "ARM",
searchIn: "title"
},
{
term: "Apple Silicon",
searchIn: "title"
},
{
term: "IBM Z",
searchIn: "title"
},
],
},
// Add more label configurations here as needed // Add more label configurations here as needed
// example: { // example: {
// keywords: [...], // keywords: [...],

View File

@@ -9,10 +9,10 @@ on:
jobs: jobs:
macos-m1-smoke-test: macos-m1-smoke-test:
runs-on: macos-latest runs-on: macos-latest
timeout-minutes: 30 timeout-minutes: 20
steps: steps:
- uses: actions/checkout@v6.0.1 - uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7 - uses: astral-sh/setup-uv@v7
with: with:
@@ -37,14 +37,15 @@ jobs:
- name: Verify installation - name: Verify installation
run: | run: |
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
- name: Smoke test vllm serve - name: Smoke test vllm serve
timeout-minutes: 10
run: | run: |
# Start server in background # Start server in background
vllm serve Qwen/Qwen3-0.6B \ vllm serve Qwen/Qwen3-0.6B \
--max-model-len=2K \ --max-model-len=2048 \
--load-format=dummy \ --load-format=dummy \
--hf-overrides '{"num_hidden_layers": 2}' \
--enforce-eager \ --enforce-eager \
--port 8000 & --port 8000 &

View File

@@ -16,8 +16,8 @@ jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with: with:
python-version: "3.12" python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

View File

@@ -7,15 +7,13 @@ on:
jobs: jobs:
close-issues-and-pull-requests: close-issues-and-pull-requests:
# Prevents triggering on forks or other repos
if: github.repository == 'vllm-project/vllm'
permissions: permissions:
issues: write issues: write
pull-requests: write pull-requests: write
actions: write actions: write
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1 - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with: with:
# Increasing this value ensures that changes to this workflow # Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months # propagate to all issues and PRs in days rather than months

3
.gitignore vendored
View File

@@ -4,9 +4,6 @@
# vllm-flash-attn built from source # vllm-flash-attn built from source
vllm/vllm_flash_attn/* vllm/vllm_flash_attn/*
# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*
# triton jit # triton jit
.triton .triton

View File

@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X # ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM}) NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.") "expected for ROCm build, saw ${Torch_VERSION} instead.")
endif() endif()
@@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.2.1") set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -354,17 +354,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs. # Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
# marlin arches for fp16 output cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
# marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
# #
@@ -374,18 +365,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}) OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$ENV{PYTHONPATH} PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR} ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
RESULT_VARIABLE marlin_generation_result RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -398,15 +387,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else() else()
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH} set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin generate script hash and arch" FORCE) CACHE STRING "Last run Marlin generate script hash" FORCE)
message(STATUS "Marlin generation completed successfully.") message(STATUS "Marlin generation completed successfully.")
endif() endif()
else() else()
message(STATUS "Marlin generation script has not changed, skipping generation.") message(STATUS "Marlin generation script has not changed, skipping generation.")
endif() endif()
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu") file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}") CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -414,34 +403,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
endif()
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu") "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
@@ -637,15 +604,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else() else()
message(STATUS "Not building NVFP4 as no compatible archs were found.") message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -822,7 +786,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH} PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output OUTPUT_VARIABLE machete_generation_output
@@ -874,10 +838,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu" "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
)
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
@@ -947,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu") "csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -976,15 +938,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# moe marlin arches # 9.0 for latest bf16 atomicAdd PTX
# note that we always set `use_atomic_add=False` for moe marlin now, cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
# so we don't need 9.0 for bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# moe marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
# #
@@ -994,18 +949,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}) OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$ENV{PYTHONPATH} PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR} ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -1018,7 +971,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else() else()
set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE) CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.") message(STATUS "Marlin MOE generation completed successfully.")
endif() endif()
@@ -1026,28 +979,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.") message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif() endif()
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu") file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}" SRCS "${MOE_WNAA16_MARLIN_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_MOE_SRC} set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
if (MARLIN_MOE_FP8_ARCHS) list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_FP8_SRC}"
CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_MOE_FP8_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else() else()
@@ -1089,11 +1030,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
WITH_SOABI) WITH_SOABI)
endif() endif()
# For CUDA and HIP builds also build the triton_kernels external package.
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
include(cmake/external_projects/triton_kernels.cmake)
endif()
# For CUDA we also build and ship some external projects. # For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA") if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/flashmla.cmake)

View File

@@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥 *Latest News* 🔥
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
@@ -137,19 +136,16 @@ Compute Resources:
- Alibaba Cloud - Alibaba Cloud
- AMD - AMD
- Anyscale - Anyscale
- Arm
- AWS - AWS
- Crusoe Cloud - Crusoe Cloud
- Databricks - Databricks
- DeepInfra - DeepInfra
- Google Cloud - Google Cloud
- IBM
- Intel - Intel
- Lambda Lab - Lambda Lab
- Nebius - Nebius
- Novita AI - Novita AI
- NVIDIA - NVIDIA
- Red Hat
- Replicate - Replicate
- Roblox - Roblox
- RunPod - RunPod

View File

@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
``` ```
### 2. Maximize Throughput with a Latency Requirement #### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**: - **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500 MAX_LATENCY_ALLOWED_MS=500
``` ```
### 3. Maximize Throughput with Prefix Caching and Latency Requirements #### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**: - **Configuration**:

View File

@@ -18,11 +18,6 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
HOSTNAME=$(hostname)
if [[ -z "$HOSTNAME" ]]; then
echo "Error: Failed to determine hostname." >&2
exit 1
fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG" LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt" RESULT="$LOG_FOLDER/result.txt"
@@ -87,7 +82,6 @@ start_server() {
"$MODEL" "$MODEL"
"--disable-log-requests" "--disable-log-requests"
"--port" "8004" "--port" "8004"
"--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization" "--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs" "--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens" "--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -102,9 +96,8 @@ start_server() {
# This correctly passes each element as a separate argument. # This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled # Start server with profiling enabled
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}" VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
VLLM_SERVER_DEV_MODE=1 \ vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else else
# Start server without profiling # Start server without profiling
VLLM_SERVER_DEV_MODE=1 \ VLLM_SERVER_DEV_MODE=1 \
@@ -119,7 +112,7 @@ start_server() {
# since that we should always have permission to send signal to the server process. # since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout) RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1 server_started=1
@@ -179,7 +172,6 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \ --num-prompts 1000 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log" --port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -195,7 +187,7 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1)) request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
# clear prefix cache # clear prefix cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5 sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \ vllm bench serve \
@@ -211,7 +203,6 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log" --port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -312,7 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 \ --port 8004 \
--profile &> "$bm_log" --profile &> "$bm_log"
else else

View File

@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False kwargs["use_fast"] = False
if tokenizer_mode == "mistral": if tokenizer_mode == "mistral":
try: try:
from vllm.tokenizers.mistral import MistralTokenizer from vllm.transformers_utils.tokenizer import MistralTokenizer
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"MistralTokenizer requires vllm package.\n" "MistralTokenizer requires vllm package.\n"

View File

@@ -1,120 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
This focuses on a single test payload shaped like the prefix-cache hash input:
(32-byte bytes object, 32-int tuple)
Usage:
python benchmarks/hash_micro_benchmark.py --iterations 20000
"""
from __future__ import annotations
import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable
from vllm.utils.hashing import sha256, xxhash
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
"""Generate a deterministic test payload."""
random.seed(seed)
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
return (bytes_data, int_tuple)
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
times: list[float] = []
# Warm-up to avoid first-run noise.
for _ in range(200):
func(data)
for _ in range(iterations):
start = time.perf_counter()
func(data)
end = time.perf_counter()
times.append(end - start)
avg = statistics.mean(times)
std = statistics.stdev(times) if len(times) > 1 else 0.0
return avg, std
def _run_benchmarks(
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
data: tuple,
iterations: int,
):
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
for name, func in benchmarks:
try:
avg, std = _benchmark_func(func, data, iterations)
except ModuleNotFoundError as exc:
print(f"Skipping {name}: {exc}")
continue
yield name, avg, std
def builtin_hash(data: tuple) -> int:
"""Wrapper for Python's built-in hash()."""
return hash(data)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--iterations",
type=int,
default=10_000,
help="Number of measured iterations per hash function.",
)
parser.add_argument(
"--seed", type=int, default=42, help="Random seed for test payload."
)
args = parser.parse_args()
data = _generate_test_data(args.seed)
benchmarks = (
("SHA256 (pickle)", sha256),
("xxHash (pickle)", xxhash),
("built-in hash()", builtin_hash),
)
print("=" * 60)
print("HASH FUNCTION MICRO BENCHMARK")
print("=" * 60)
print("Test data: (32-byte bytes object, 32-int tuple)")
print(f"Iterations: {args.iterations:,}")
print("=" * 60)
results = list(_run_benchmarks(benchmarks, data, args.iterations))
builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
print("\nResults:")
for name, avg, std in results:
print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
if builtin_entry:
_, builtin_avg, _ = builtin_entry
print("\n" + "=" * 60)
print("SUMMARY (relative to built-in hash())")
print("=" * 60)
for name, avg, _ in results:
if name == "built-in hash()":
continue
speed_ratio = avg / builtin_avg
print(f"{name} is {speed_ratio:.1f}x slower than built-in hash()")
else:
print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
if __name__ == "__main__":
main()

View File

@@ -32,11 +32,12 @@ def benchmark_propose(args):
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model="facebook/opt-125m",
task="generate",
max_model_len=args.num_token + args.num_spec_token, max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m", tokenizer="facebook/opt-125m",
tokenizer_mode="auto", tokenizer_mode="auto",
dtype="auto", dtype="auto",
seed=0, seed=None,
trust_remote_code=False, trust_remote_code=False,
) )
proposer = NgramProposer( proposer = NgramProposer(
@@ -107,10 +108,7 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type), device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(), parallel_config=ParallelConfig(),
load_config=LoadConfig(), load_config=LoadConfig(),
scheduler_config=SchedulerConfig( scheduler_config=SchedulerConfig(),
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
),
) )
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group

View File

@@ -1,110 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simple benchmark to compare prefix-cache block hashing algorithms.
Example:
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
"""
from __future__ import annotations
import argparse
import random
import statistics
import sys
import time
from collections.abc import Callable, Iterable, Sequence
from vllm.utils.hashing import get_hash_fn_by_name
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
def _generate_blocks(
num_blocks: int, block_size: int, vocab_size: int, seed: int
) -> list[list[int]]:
rng = random.Random(seed)
return [
[rng.randrange(vocab_size) for _ in range(block_size)]
for _ in range(num_blocks)
]
def _hash_all_blocks(
hash_fn: Callable[[object], bytes],
blocks: Iterable[Sequence[int]],
) -> float:
parent_hash: BlockHash | None = None
start = time.perf_counter()
for block in blocks:
parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
end = time.perf_counter()
return end - start
def _benchmark(
hash_algo: str,
blocks: list[list[int]],
trials: int,
) -> tuple[float, float, float] | None:
try:
hash_fn = get_hash_fn_by_name(hash_algo)
init_none_hash(hash_fn)
timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
except ModuleNotFoundError as exc:
print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
return None
avg = statistics.mean(timings)
best = min(timings)
# throughput: tokens / second
tokens_hashed = len(blocks) * len(blocks[0])
throughput = tokens_hashed / best
return avg, best, throughput
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
parser.add_argument(
"--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
)
parser.add_argument("--seed", type=int, default=0, help="Random seed.")
parser.add_argument(
"--trials", type=int, default=5, help="Number of timed trials per algorithm."
)
parser.add_argument(
"--algorithms",
nargs="+",
default=SUPPORTED_ALGOS,
choices=SUPPORTED_ALGOS,
help="Hash algorithms to benchmark.",
)
args = parser.parse_args()
blocks = _generate_blocks(
args.num_blocks, args.block_size, args.vocab_size, args.seed
)
print(
f"Benchmarking {len(args.algorithms)} algorithms on "
f"{args.num_blocks} blocks (block size={args.block_size})."
)
for algo in args.algorithms:
result = _benchmark(algo, blocks, args.trials)
if result is None:
continue
avg, best, throughput = result
print(
f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
f"throughput: {throughput / 1e6:.2f}M tokens/s"
)
if __name__ == "__main__":
main()

View File

@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
try: try:
from vllm.tokenizers import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError: except ImportError:
from backend_request_func import get_tokenizer from backend_request_func import get_tokenizer

View File

@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
try: try:
from vllm.tokenizers import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError: except ImportError:
from backend_request_func import get_tokenizer from backend_request_func import get_tokenizer
@@ -574,7 +574,7 @@ async def benchmark(
) )
print( print(
"{:<40} {:<10.2f}".format( "{:<40} {:<10.2f}".format(
"Total token throughput (tok/s):", metrics.total_token_throughput "Total Token throughput (tok/s):", metrics.total_token_throughput
) )
) )
@@ -963,7 +963,8 @@ def create_argument_parser():
parser.add_argument( parser.add_argument(
"--profile", "--profile",
action="store_true", action="store_true",
help="Use vLLM Profiling. --profiler-config must be provided on the server.", help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
) )
parser.add_argument( parser.add_argument(
"--result-dir", "--result-dir",

View File

@@ -5,12 +5,11 @@ import argparse
import asyncio import asyncio
import logging import logging
import os import os
import time
import uuid
from urllib.parse import urlparse
import aiohttp import aiohttp
from quart import Quart, Response, make_response, request from quart import Quart, Response, make_response, request
from rate_limiter import RateLimiter
from request_queue import RequestQueue
# Configure logging # Configure logging
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
@@ -25,8 +24,26 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--timeout", "--timeout",
type=float, type=float,
default=6 * 60 * 60, default=300,
help="Timeout for backend service requests in seconds (default: 21600)", help="Timeout for backend service requests in seconds (default: 300)",
)
parser.add_argument(
"--max-concurrent",
type=int,
default=100,
help="Maximum concurrent requests to backend services (default: 100)",
)
parser.add_argument(
"--queue-size",
type=int,
default=500,
help="Maximum number of requests in the queue (default: 500)",
)
parser.add_argument(
"--rate-limit",
type=int,
default=40,
help="Maximum requests per second (default: 40)",
) )
parser.add_argument( parser.add_argument(
"--port", "--port",
@@ -37,32 +54,14 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--prefill-url", "--prefill-url",
type=str, type=str,
default="http://localhost:8100", default="http://localhost:8100/v1/completions",
help="Prefill service base URL (protocol + host[:port])", help="Prefill service endpoint URL",
) )
parser.add_argument( parser.add_argument(
"--decode-url", "--decode-url",
type=str, type=str,
default="http://localhost:8200", default="http://localhost:8200/v1/completions",
help="Decode service base URL (protocol + host[:port])", help="Decode service endpoint URL",
)
parser.add_argument(
"--kv-host",
type=str,
default="localhost",
help="Hostname or IP used by KV transfer (default: localhost)",
)
parser.add_argument(
"--prefill-kv-port",
type=int,
default=14579,
help="Prefill KV port (default: 14579)",
)
parser.add_argument(
"--decode-kv-port",
type=int,
default=14580,
help="Decode KV port (default: 14580)",
) )
return parser.parse_args() return parser.parse_args()
@@ -74,129 +73,70 @@ def main():
# Initialize configuration using command line parameters # Initialize configuration using command line parameters
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
MAX_CONCURRENT_REQUESTS = args.max_concurrent
REQUEST_QUEUE_SIZE = args.queue_size
RATE_LIMIT = args.rate_limit
PREFILL_SERVICE_URL = args.prefill_url PREFILL_SERVICE_URL = args.prefill_url
DECODE_SERVICE_URL = args.decode_url DECODE_SERVICE_URL = args.decode_url
PORT = args.port PORT = args.port
PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
logger.info(
"Proxy resolved KV addresses -> prefill: %s, decode: %s",
PREFILL_KV_ADDR,
DECODE_KV_ADDR,
)
app = Quart(__name__) app = Quart(__name__)
# Attach the configuration object to the application instance so helper # Initialize the rate limiter and request queue
# coroutines can read the resolved backend URLs and timeouts without using rate_limiter = RateLimiter(RATE_LIMIT)
# globals. request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
# Attach the configuration object to the application instance
app.config.update( app.config.update(
{ {
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
"rate_limiter": rate_limiter,
"request_queue": request_queue,
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
"DECODE_SERVICE_URL": DECODE_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL,
"PREFILL_KV_ADDR": PREFILL_KV_ADDR,
"DECODE_KV_ADDR": DECODE_KV_ADDR,
} }
) )
def _normalize_base_url(url: str) -> str: # Start queue processing on app startup
"""Remove any trailing slash so path joins behave predictably.""" @app.before_serving
return url.rstrip("/") async def startup():
"""Start request processing task when app starts serving"""
asyncio.create_task(request_queue.process())
def _get_host_port(url: str) -> str: async def forward_request(url, data):
"""Return the hostname:port portion for logging and KV headers.""" """Forward request to backend service with rate limiting and error handling"""
parsed = urlparse(url) headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
host = parsed.hostname or "localhost"
port = parsed.port
if port is None:
port = 80 if parsed.scheme == "http" else 443
return f"{host}:{port}"
PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) # Use rate limiter as context manager
DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) async with (
KV_TARGET = _get_host_port(DECODE_SERVICE_URL) rate_limiter,
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
def _build_headers(request_id: str) -> dict[str, str]: ):
"""Construct the headers expected by vLLM's P2P disagg connector.""" try:
headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} async with session.post(
api_key = os.environ.get("OPENAI_API_KEY") url=url, json=data, headers=headers
if api_key: ) as response:
headers["Authorization"] = f"Bearer {api_key}" if response.status == 200:
return headers # Stream response chunks
async for chunk_bytes in response.content.iter_chunked(1024):
async def _run_prefill( yield chunk_bytes
request_path: str, else:
payload: dict, # Handle backend service errors
headers: dict[str, str], error_text = await response.text()
request_id: str, logger.error(
): "Backend service error: %s - %s",
url = f"{PREFILL_BASE}{request_path}" response.status,
start_ts = time.perf_counter() error_text,
logger.info("[prefill] start request_id=%s url=%s", request_id, url) )
try: yield b'{"error": "Backend service error"}'
async with ( except aiohttp.ClientError as e:
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, # Handle connection errors
session.post(url=url, json=payload, headers=headers) as resp, logger.error("Connection error to %s: %s", url, str(e))
): yield b'{"error": "Service unavailable"}'
if resp.status != 200: except asyncio.TimeoutError:
error_text = await resp.text() # Handle timeout errors
raise RuntimeError( logger.error("Timeout connecting to %s", url)
f"Prefill backend error {resp.status}: {error_text}" yield b'{"error": "Service timeout"}'
)
await resp.read()
logger.info(
"[prefill] done request_id=%s status=%s elapsed=%.2fs",
request_id,
resp.status,
time.perf_counter() - start_ts,
)
except asyncio.TimeoutError as exc:
raise RuntimeError(f"Prefill service timeout at {url}") from exc
except aiohttp.ClientError as exc:
raise RuntimeError(f"Prefill service unavailable at {url}") from exc
async def _stream_decode(
request_path: str,
payload: dict,
headers: dict[str, str],
request_id: str,
):
url = f"{DECODE_BASE}{request_path}"
# Stream tokens from the decode service once the prefill stage has
# materialized KV caches on the target workers.
logger.info("[decode] start request_id=%s url=%s", request_id, url)
try:
async with (
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
session.post(url=url, json=payload, headers=headers) as resp,
):
if resp.status != 200:
error_text = await resp.text()
logger.error(
"Decode backend error %s - %s", resp.status, error_text
)
err_msg = (
'{"error": "Decode backend error ' + str(resp.status) + '"}'
)
yield err_msg.encode()
return
logger.info(
"[decode] streaming response request_id=%s status=%s",
request_id,
resp.status,
)
async for chunk_bytes in resp.content.iter_chunked(1024):
yield chunk_bytes
logger.info("[decode] finished streaming request_id=%s", request_id)
except asyncio.TimeoutError:
logger.error("Decode service timeout at %s", url)
yield b'{"error": "Decode service timeout"}'
except aiohttp.ClientError as exc:
logger.error("Decode service error at %s: %s", url, exc)
yield b'{"error": "Decode service unavailable"}'
async def process_request(): async def process_request():
"""Process a single request through prefill and decode stages""" """Process a single request through prefill and decode stages"""
@@ -206,27 +146,13 @@ def main():
# Create prefill request (max_tokens=1) # Create prefill request (max_tokens=1)
prefill_request = original_request_data.copy() prefill_request = original_request_data.copy()
prefill_request["max_tokens"] = 1 prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
prefill_request["max_completion_tokens"] = 1
# Execute prefill stage # Execute prefill stage
# The request id encodes both KV socket addresses so the backend can async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
# shuttle tensors directly via NCCL once the prefill response continue
# completes.
request_id = (
f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
)
headers = _build_headers(request_id)
await _run_prefill(request.path, prefill_request, headers, request_id)
# Execute decode stage and stream response # Execute decode stage and stream response
# Pass the unmodified user request so the decode phase can continue generator = forward_request(DECODE_SERVICE_URL, original_request_data)
# sampling with the already-populated KV cache.
generator = _stream_decode(
request.path, original_request_data, headers, request_id
)
response = await make_response(generator) response = await make_response(generator)
response.timeout = None # Disable timeout for streaming response response.timeout = None # Disable timeout for streaming response
return response return response
@@ -242,10 +168,23 @@ def main():
@app.route("/v1/completions", methods=["POST"]) @app.route("/v1/completions", methods=["POST"])
async def handle_request(): async def handle_request():
"""Handle incoming API requests with concurrency and rate limiting""" """Handle incoming API requests with concurrency and rate limiting"""
# Create task for request processing
task = asyncio.create_task(process_request())
# Enqueue request or reject if queue is full
if not await request_queue.enqueue(task):
return Response(
response=b'{"error": "Server busy, try again later"}',
status=503,
content_type="application/json",
)
try: try:
return await process_request() # Return the response from the processing task
return await task
except asyncio.CancelledError: except asyncio.CancelledError:
logger.warning("Request cancelled") # Handle task cancellation (timeout or queue full)
logger.warning("Request cancelled due to timeout or queue full")
return Response( return Response(
response=b'{"error": "Request cancelled"}', response=b'{"error": "Request cancelled"}',
status=503, status=503,

View File

@@ -14,9 +14,6 @@ from tqdm import tqdm
import vllm._custom_ops as ops import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
)
@dataclass @dataclass
@@ -25,7 +22,6 @@ class bench_params_t:
hidden_size: int hidden_size: int
add_residual: bool add_residual: bool
dtype: torch.dtype dtype: torch.dtype
group_size: list[int]
def description(self): def description(self):
return ( return (
@@ -33,7 +29,6 @@ class bench_params_t:
f"x D {self.hidden_size} " f"x D {self.hidden_size} "
f"x R {self.add_residual} " f"x R {self.add_residual} "
f"x DT {self.dtype}" f"x DT {self.dtype}"
f"x GS {self.group_size}"
) )
@@ -43,11 +38,10 @@ def get_bench_params() -> list[bench_params_t]:
HIDDEN_SIZES = list(range(1024, 8129, 1024)) HIDDEN_SIZES = list(range(1024, 8129, 1024))
ADD_RESIDUAL = [True, False] ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
GROUP_SIZES = [[1, 64], [1, 128]]
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES) combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
bench_params = list( bench_params = list(
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations) map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
) )
return bench_params return bench_params
@@ -58,7 +52,6 @@ def unfused_int8_impl(
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: torch.Tensor | None,
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
group_size: list[int],
): ):
# Norm # Norm
torch_out = None torch_out = None
@@ -76,7 +69,6 @@ def unfused_fp8_impl(
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: torch.Tensor | None,
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
group_size: list[int],
): ):
# Norm # Norm
torch_out = None torch_out = None
@@ -89,63 +81,23 @@ def unfused_fp8_impl(
torch_out, _ = ops.scaled_fp8_quant(torch_out) torch_out, _ = ops.scaled_fp8_quant(torch_out)
def unfused_groupwise_fp8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
group_size: list[int],
):
# Norm
torch_out = None
if residual is None:
torch_out = rms_norm_layer.forward_cuda(x, residual)
else:
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
# Quant
torch_out, _ = per_token_group_quant_fp8(
torch_out, group_size=group_size[1], use_ue8m0=False
)
def fused_impl( def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: torch.Tensor | None,
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
group_size: list[int],
): ):
out, _ = ops.rms_norm_dynamic_per_token_quant( out, _ = ops.rms_norm_dynamic_per_token_quant(
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
) )
def fused_groupwise_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
group_size: list[int],
):
out, _ = ops.rms_norm_per_block_quant(
x,
rms_norm_layer.weight,
1e-6,
quant_dtype,
group_size,
residual=residual,
is_scale_transposed=True,
)
# Bench functions # Bench functions
def bench_fn( def bench_fn(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor, residual: torch.Tensor,
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
group_size: list[int],
label: str, label: str,
sub_label: str, sub_label: str,
fn: Callable, fn: Callable,
@@ -158,11 +110,10 @@ def bench_fn(
"x": x, "x": x,
"residual": residual, "residual": residual,
"quant_dtype": quant_dtype, "quant_dtype": quant_dtype,
"group_size": group_size,
"fn": fn, "fn": fn,
} }
return TBenchmark.Timer( return TBenchmark.Timer(
stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)", stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@@ -196,7 +147,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x, x,
residual, residual,
torch.int8, torch.int8,
params.group_size,
label, label,
sub_label, sub_label,
unfused_int8_impl, unfused_int8_impl,
@@ -211,7 +161,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x, x,
residual, residual,
torch.float8_e4m3fn, torch.float8_e4m3fn,
params.group_size,
label, label,
sub_label, sub_label,
unfused_fp8_impl, unfused_fp8_impl,
@@ -226,7 +175,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x, x,
residual, residual,
torch.int8, torch.int8,
params.group_size,
label, label,
sub_label, sub_label,
fused_impl, fused_impl,
@@ -241,7 +189,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x, x,
residual, residual,
torch.float8_e4m3fn, torch.float8_e4m3fn,
params.group_size,
label, label,
sub_label, sub_label,
fused_impl, fused_impl,
@@ -249,36 +196,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
) )
) )
# unfused groupwise fp8 impl.
timers.append(
bench_fn(
layer,
x,
residual,
torch.float8_e4m3fn,
params.group_size,
label,
sub_label,
unfused_groupwise_fp8_impl,
"unfused_groupwise_fp8_impl",
)
)
# fused groupwise fp8 impl.
timers.append(
bench_fn(
layer,
x,
residual,
torch.float8_e4m3fn,
params.group_size,
label,
sub_label,
fused_groupwise_impl,
"fused_groupwise_fp8_impl",
)
)
print_timers(timers) print_timers(timers)
return timers return timers

View File

@@ -1,244 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from enum import Enum
from itertools import product
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_per_token_group_quant_fp8_colmajor,
silu_mul_per_token_group_quant_fp8_colmajor,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from .utils import ArgPool, Bench, CudaGraphBenchParams
GROUP_SIZE = 128
FLOAT8_T = torch.float8_e4m3fn
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
print(
f"Note : The timings reported above is for {cuda_graph_nops} "
"consecutive invocations of the benchmarking functions. "
f"Please divide by {cuda_graph_nops} for single invocation "
"timings."
)
compare = TBenchmark.Compare(timers)
compare.print()
class ImplType(Enum):
SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
REFERENCE = 2
def get_impl(self):
if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
return silu_mul_per_token_group_quant_fp8_colmajor
elif self == ImplType.REFERENCE:
return reference
raise ValueError(f"Unrecognized ImplType {self}")
@dataclass
class BenchmarkTensors:
input: torch.Tensor
output: torch.Tensor
# Reference act output tensor
ref_act_out: torch.Tensor
ref_quant_out: torch.Tensor
@staticmethod
def make(T: int, N: int) -> "BenchmarkTensors":
assert T % GROUP_SIZE == 0
assert N % (GROUP_SIZE * 2) == 0
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
# silu_mul_per_token_group_quant_fp8_colmajor output.
output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
FLOAT8_T
)
# reference output.
ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
ref_quant_out = torch.empty(
(T, N // 2), dtype=torch.bfloat16, device="cuda"
).to(FLOAT8_T)
return BenchmarkTensors(
input=input,
output=output,
ref_act_out=ref_act_out,
ref_quant_out=ref_quant_out,
)
@property
def T(self):
return self.input.size(0)
@property
def N(self):
return self.input.size(1)
def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
return {
"input": self.input,
"output": self.output,
"use_ue8m0": is_deep_gemm_e8m0_used(),
}
elif impl_type == ImplType.REFERENCE:
return {
"input": self.input,
"act_out": self.ref_act_out,
"quant_out": self.ref_quant_out,
"use_ue8m0": is_deep_gemm_e8m0_used(),
}
raise ValueError(f"Unrecognized impl_type {impl_type}")
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
"""
Reference triton quant kernel from,
vllm.model_executor.layers.quantization.utils.fp8_utils
"""
assert quant_out.size() == x.size()
# Allocate the scale tensor column-major format.
shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
x_q = quant_out
x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
M = x.numel() // GROUP_SIZE
N = GROUP_SIZE
BLOCK = triton.next_power_of_2(N)
# heuristics for number of warps
num_warps = min(max(BLOCK // 256, 1), 8)
num_stages = 1
finfo = torch.finfo(FLOAT8_T)
fp8_min = finfo.min
fp8_max = finfo.max
_per_token_group_quant_fp8_colmajor[(M,)](
x,
x_q,
x_s,
GROUP_SIZE,
x.shape[1],
x.stride(0),
x_s.stride(1),
eps=1e-10,
fp8_min=fp8_min,
fp8_max=fp8_max,
use_ue8m0=use_ue8m0,
BLOCK=BLOCK,
num_warps=num_warps,
num_stages=num_stages,
)
return x_q, x_s
def reference(
input: torch.Tensor,
act_out: torch.Tensor,
quant_out: torch.Tensor,
use_ue8m0: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
torch.ops._C.silu_and_mul(act_out, input)
return reference_quant(act_out, quant_out, use_ue8m0)
def bench_impl(
bench_tensors: list[BenchmarkTensors], impl_type: ImplType
) -> TMeasurement:
T = bench_tensors[0].T
N = bench_tensors[0].N
arg_pool_size = len(bench_tensors)
kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
# warmup
for kwargs in kwargs_list:
impl_type.get_impl()(**kwargs)
torch.cuda.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
for _kwargs in kwargs_list:
for k, v in _kwargs.items():
kwargs[k].values.append(v)
cuda_graph_params = None
cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
timer = None
with Bench(
cuda_graph_params,
"silu-mul-quant",
f"num_tokens={T}, N={N}",
impl_type.name,
impl_type.get_impl(),
**kwargs,
) as bench:
timer = bench.run()
return timer
def test_correctness(T: int, N: int):
print(f"Testing num_tokens={T}, N={N} ...")
bench_tensor = BenchmarkTensors.make(T, N)
def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
# reference output
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
# test ouptut
out_q, out_s = output_from_impl(
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
)
torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
torch.testing.assert_close(ref_out_s, out_s)
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
timers = []
for N, T in product(Ns, Ts):
test_correctness(T, N)
bench_tensors: list[BenchmarkTensors] = [
BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
]
silu_mul_quant_timer = bench_impl(
bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
)
timers.append(silu_mul_quant_timer)
reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
timers.append(reference_timer)
print_timers(
[silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
)
print_timers(timers, cuda_graph_nops=arg_pool_size)
return timers
if __name__ == "__main__":
T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
N = [2048, 4096, 8192]
print(f"T = {T}, N = {N}")
run(T, N, arg_pool_size=8)

View File

@@ -255,8 +255,8 @@ def bench_run(
torch.cuda.synchronize() torch.cuda.synchronize()
# Timing # Timing
start_event = torch.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
latencies = [] latencies = []
for _ in range(num_iters): for _ in range(num_iters):

View File

@@ -237,7 +237,6 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q, b_q_weight=w_q,
b_bias=None, b_bias=None,
b_scales=w_s, b_scales=w_s,
a_scales=None,
global_scale=None, global_scale=None,
b_zeros=w_zp, b_zeros=w_zp,
g_idx=g_idx, g_idx=g_idx,

View File

@@ -263,7 +263,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,

View File

@@ -1,150 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.
This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""
import time
from collections.abc import Callable
import torch
# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Original torch.cat approach with expand."""
return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Optimized direct copy approach (avoids expand + cat overhead)."""
k = torch.empty(
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
dtype=k_nope.dtype,
device=k_nope.device,
)
k[..., : k_nope.shape[-1]] = k_nope
k[..., k_nope.shape[-1] :] = k_pe
return k
def benchmark_method(
method: Callable,
k_nope: torch.Tensor,
k_pe: torch.Tensor,
num_warmup: int = 10,
num_iters: int = 100,
) -> float:
"""Benchmark a concatenation method and return mean latency in ms."""
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms
@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
"""Run benchmark for a specific dtype."""
torch.set_default_device("cuda")
# Batch sizes to test (powers of 2 from 32 to 65536)
batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
print("=" * 80)
print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
print("=" * 80)
print(
f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
f"k_pe=[B, 1, {PE_DIM}]"
)
print(f"dtype: {dtype_name}")
print()
print(
f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
f"{'Speedup':>8} | {'Reduction':>10}"
)
print("-" * 70)
results = []
for batch_size in batch_sizes:
# Create input tensors (generate in float32 then convert for FP8 compatibility)
k_nope = torch.randn(
batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
k_pe = torch.randn(
batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
# Benchmark both methods
cat_time = benchmark_method(cat_method, k_nope, k_pe)
direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
speedup = cat_time / direct_time
reduction = (1 - direct_time / cat_time) * 100
results.append((batch_size, cat_time, direct_time, speedup, reduction))
print(
f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
f"{speedup:>7.2f}x | {reduction:>9.1f}%"
)
print("=" * 80)
# Summary statistics
speedups = [r[3] for r in results]
print("\nSpeedup summary:")
print(f" Min: {min(speedups):.2f}x")
print(f" Max: {max(speedups):.2f}x")
print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
# Find crossover point
crossover_batch = None
for batch_size, _, _, speedup, _ in results:
if speedup >= 1.0:
crossover_batch = batch_size
break
print("\nConclusion:")
if crossover_batch:
print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
# Filter for large batches (>= 512 which is typical for prefill)
large_batch_speedups = [r[3] for r in results if r[0] >= 512]
if large_batch_speedups:
avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
print(" - MLA prefill typically uses large batches, so optimization is effective")
return results
@torch.inference_mode()
def main():
# Test bfloat16
print("\n")
run_benchmark(torch.bfloat16, "bfloat16")
# Test float8_e4m3fn
print("\n")
run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
if __name__ == "__main__":
main()

View File

@@ -185,8 +185,8 @@ def benchmark_config(
graph.replay() graph.replay()
torch.cuda.synchronize() torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = [] latencies: list[float] = []
for i in range(num_iters): for i in range(num_iters):

View File

@@ -24,15 +24,12 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
num_tokens_range = [1, 16, 256, 4096] num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512] num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8] topk_range = [1, 2, 8]
ep_size_range = [1, 8] configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
configs = list(
itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
)
@triton.testing.perf_report( @triton.testing.perf_report(
triton.testing.Benchmark( triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk", "ep_size"], x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs, x_vals=configs,
line_arg="provider", line_arg="provider",
line_vals=["vllm"], line_vals=["vllm"],
@@ -41,26 +38,16 @@ configs = list(
args={}, args={},
) )
) )
def benchmark(num_tokens, num_experts, topk, ep_size, provider): def benchmark(num_tokens, num_experts, topk, provider):
"""Benchmark function for Triton.""" """Benchmark function for Triton."""
block_size = 256 block_size = 256
torch.cuda.manual_seed_all(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk) topk_ids = get_topk_ids(num_tokens, num_experts, topk)
e_map = None
if ep_size != 1:
local_e = num_experts // ep_size
e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
quantiles = [0.5, 0.2, 0.8] quantiles = [0.5, 0.2, 0.8]
if provider == "vllm": if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench( ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size( lambda: moe_align_block_size(topk_ids, block_size, num_experts),
topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
),
quantiles=quantiles, quantiles=quantiles,
) )

View File

@@ -105,8 +105,8 @@ def benchmark_permute(
graph.replay() graph.replay()
torch.cuda.synchronize() torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = [] latencies: list[float] = []
for i in range(num_iters): for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
graph.replay() graph.replay()
torch.cuda.synchronize() torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = [] latencies: list[float] = []
for i in range(num_iters): for i in range(num_iters):

View File

@@ -6,7 +6,7 @@
# #
# The CSV file (named with current date/time) contains these columns: # The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup # speedup
# #
@@ -86,8 +86,9 @@ def benchmark_mrope(
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
max_position: int = 8192, max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True, is_neox_style: bool = True,
rope_parameters: dict[str, Any] | None = None, rope_scaling: dict[str, Any] = None,
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
seed: int = 0, seed: int = 0,
warmup_iter: int = 10, warmup_iter: int = 10,
@@ -99,9 +100,11 @@ def benchmark_mrope(
# the parameters to compute the q k v size based on tp_size # the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope( mrope_helper_class = get_rope(
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position, max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
rope_parameters=rope_parameters, rope_scaling=rope_scaling,
dtype=dtype, dtype=dtype,
).to(device=device) ).to(device=device)
@@ -200,8 +203,9 @@ def benchmark_mrope(
num_kv_heads, num_kv_heads,
head_dim, head_dim,
max_position, max_position,
rope_theta,
is_neox_style, is_neox_style,
str(rope_parameters), str(rope_scaling),
str(dtype).split(".")[-1], str(dtype).split(".")[-1],
torch_stats["mean"], torch_stats["mean"],
torch_stats["median"], torch_stats["median"],
@@ -251,8 +255,9 @@ if __name__ == "__main__":
"num_kv_heads", "num_kv_heads",
"head_dim", "head_dim",
"max_position", "max_position",
"rope_theta",
"is_neox_style", "is_neox_style",
"rope_parameters", "rope_scaling",
"dtype", "dtype",
"torch_mean", "torch_mean",
"torch_median", "torch_median",
@@ -298,7 +303,7 @@ if __name__ == "__main__":
q_size = num_heads * head_dim q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim kv_size = num_kv_heads * head_dim
is_neox_style = True is_neox_style = True
rope_parameters = config.rope_parameters rope_theta = config.rope_theta
max_position = config.max_position_embeddings max_position = config.max_position_embeddings
for num_tokens in num_tokens_list: for num_tokens in num_tokens_list:
@@ -310,8 +315,9 @@ if __name__ == "__main__":
num_heads=num_heads, num_heads=num_heads,
num_kv_heads=num_kv_heads, num_kv_heads=num_kv_heads,
max_position=max_position, max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
rope_parameters=rope_parameters, rope_scaling=config.rope_scaling,
dtype=getattr(torch, args.dtype), dtype=getattr(torch, args.dtype),
seed=args.seed, seed=args.seed,
warmup_iter=args.warmup_iter, warmup_iter=args.warmup_iter,

View File

@@ -30,8 +30,8 @@ def _time_cuda(
fn() fn()
torch.cuda.synchronize() torch.cuda.synchronize()
start = torch.Event(enable_timing=True) start = torch.cuda.Event(enable_timing=True)
end = torch.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True)
start.record() start.record()
for _ in range(bench_iters): for _ in range(bench_iters):

View File

@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
def benchmark(batch_size, seq_len, num_heads, provider): def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16 dtype = torch.bfloat16
max_position = 8192 max_position = 8192
rope_parameters = {"partial_rotary_factor": rotary_dim / head_size} base = 10000
rope = get_rope(head_size, max_position, is_neox_style, rope_parameters) rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
rope = rope.to(dtype=dtype, device=device) rope = rope.to(dtype=dtype, device=device)
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device) cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)

View File

@@ -253,8 +253,8 @@ def benchmark(
) )
torch.cuda.synchronize() torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
# Benchmark # Benchmark
latencies: list[float] = [] latencies: list[float] = []

View File

@@ -127,8 +127,8 @@ def benchmark_decode(
def time_fn(fn, warmup=10, trials=20): def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize() torch.cuda.synchronize()
start = torch.Event(enable_timing=True) start = torch.cuda.Event(enable_timing=True)
end = torch.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True)
times = [] times = []
for i in range(warmup): for i in range(warmup):
fn() fn()

View File

@@ -139,8 +139,8 @@ def benchmark_prefill(
def time_fn(fn, warmup=10, trials=20): def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize() torch.cuda.synchronize()
start = torch.Event(enable_timing=True) start = torch.cuda.Event(enable_timing=True)
end = torch.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True)
times = [] times = []
for i in range(warmup): for i in range(warmup):
fn() fn()

View File

@@ -183,8 +183,8 @@ def benchmark_config(
run() run()
torch.cuda.synchronize() torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = [] latencies: list[float] = []
for i in range(num_iters): for i in range(num_iters):

View File

@@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
Currently, this just includes dense GEMMs and only works on Hopper GPUs. Currently this just includes dense GEMMs and only works on Hopper GPUs.
## Setup ## Setup

View File

@@ -55,10 +55,6 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
---------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------
``` ```
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
benchmark-only runtime so the reported throughput stays comparable).
### JSON configuration file for synthetic conversations generation ### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/> The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>

View File

@@ -1076,7 +1076,6 @@ def process_statistics(
verbose: bool, verbose: bool,
gen_conv_args: GenConvArgs | None = None, gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False, excel_output: bool = False,
warmup_runtime_sec: float | None = None,
) -> None: ) -> None:
if len(client_metrics) == 0: if len(client_metrics) == 0:
logger.info("No samples to process") logger.info("No samples to process")
@@ -1170,13 +1169,8 @@ def process_statistics(
# Convert milliseconds to seconds # Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0 runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec requests_per_sec = float(len(df)) / runtime_sec
params = {
"runtime_sec": runtime_sec, params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
"requests_per_sec": requests_per_sec,
}
if warmup_runtime_sec is not None:
params["warmup_runtime_sec"] = warmup_runtime_sec
params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data) # Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1558,8 +1552,6 @@ async def main() -> None:
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
) )
warmup_runtime_sec: float | None = None
# Warm-up step # Warm-up step
if args.warmup_step: if args.warmup_step:
# Only send a single user prompt from every conversation. # Only send a single user prompt from every conversation.
@@ -1574,56 +1566,26 @@ async def main() -> None:
# all clients should finish their work before exiting # all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False) warmup_bench_args = bench_args._replace(early_stop=False)
logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET) logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp( conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
) )
warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns) logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.PURPLE,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark # Run the benchmark
benchmark_start_ns = time.perf_counter_ns() start_time = time.perf_counter_ns()
client_convs, client_metrics = await main_mp( client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations client_args, req_args, bench_args, tokenizer, conversations
) )
benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns) total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
# Calculate requests per second # Calculate requests per second
requests_per_sec = len(client_metrics) / benchmark_runtime_sec total_runtime_sec = total_runtime_ms / 1000.0
benchmark_runtime_ms = benchmark_runtime_sec * 1000.0 rps = len(client_metrics) / total_runtime_sec
logger.info( logger.info(
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), " f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
"requests per second: %.3f%s", f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
Color.GREEN,
benchmark_runtime_sec,
benchmark_runtime_ms,
requests_per_sec,
Color.RESET,
) )
if warmup_runtime_sec is not None:
total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.GREEN,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info(
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
Color.GREEN,
total_runtime_sec,
total_runtime_sec * 1000,
Color.RESET,
)
# Benchmark parameters # Benchmark parameters
params = { params = {
@@ -1648,7 +1610,6 @@ async def main() -> None:
verbose=args.verbose, verbose=args.verbose,
gen_conv_args=gen_conv_args, gen_conv_args=gen_conv_args,
excel_output=args.excel_output, excel_output=args.excel_output,
warmup_runtime_sec=warmup_runtime_sec,
) )
if args.output_file is not None: if args.output_file is not None:

View File

@@ -251,6 +251,17 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif() endif()
# Build ACL with CMake # Build ACL with CMake
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
set(CMAKE_BUILD_TYPE "Release")
set(ARM_COMPUTE_ARCH "armv8.2-a")
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
set(ARM_COMPUTE_BUILD_TESTING "OFF")
set(_cmake_config_cmd set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build ${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
@@ -364,7 +375,6 @@ set(VLLM_EXT_SRC
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp" "csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
${VLLM_EXT_SRC}) ${VLLM_EXT_SRC})
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC

View File

@@ -1,53 +0,0 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
FetchContent_Declare(
triton_kernels
SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
)
else()
set(TRITON_GIT "https://github.com/triton-lang/triton.git")
message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
FetchContent_Declare(
triton_kernels
# TODO (varun) : Fetch just the triton_kernels directory from Triton
GIT_REPOSITORY https://github.com/triton-lang/triton.git
GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
GIT_PROGRESS TRUE
SOURCE_SUBDIR python/triton_kernels/triton_kernels
)
endif()
# Fetch content
FetchContent_MakeAvailable(triton_kernels)
if (NOT triton_kernels_SOURCE_DIR)
message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
endif()
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
else()
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
endif()
message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
add_custom_target(triton_kernels)
# Ensure the vllm/third_party directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
## Copy .py files to install directory.
install(DIRECTORY
${TRITON_KERNELS_PYTHON_DIR}
DESTINATION
vllm/third_party/triton_kernels/
COMPONENT triton_kernels
FILES_MATCHING PATTERN "*.py")

View File

@@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43 GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -140,21 +140,16 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
run_python(_VLLM_TORCH_GOMP_PATH run_python(_VLLM_TORCH_GOMP_PATH
" "
import os, glob import os, glob
import torch try:
torch_pkg = os.path.dirname(torch.__file__) import torch
site_root = os.path.dirname(torch_pkg) torch_pkg = os.path.dirname(torch.__file__)
site_root = os.path.dirname(torch_pkg)
# Search both torch.libs and torch/lib torch_libs = os.path.join(site_root, 'torch.libs')
roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')] print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
candidates = [] except:
for root in roots: print('')
if not os.path.isdir(root):
continue
candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
print(candidates[0] if candidates else '')
" "
"failed to probe for libgomp") "failed to probe torch.libs for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}") if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return() return()
@@ -500,13 +495,7 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "") set(SOABI_KEYWORD "")
endif() endif()
run_python(IS_FREETHREADED_PYTHON if (ARG_USE_SABI)
"import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
"Failed to determine whether interpreter is free-threaded")
# Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
# so avoid using the stable ABI under free-threading only.
if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}") Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else() else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}") Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")

View File

@@ -16,8 +16,7 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output, scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output, const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads, const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size, const uint prefix_head_stride, const uint head_size) {
const uint output_head_stride) {
using pack_128b_t = uint4; using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t); const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size; const uint threads_per_head = head_size / pack_size;
@@ -35,13 +34,11 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads; const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint src_head_offset = token_idx * num_heads * prefix_head_stride + const uint head_offset =
head_idx * prefix_head_stride; token_idx * num_heads * head_size + head_idx * head_size;
const uint dst_head_offset = token_idx * num_heads * output_head_stride + const scalar_t* prefix_head_ptr = prefix_output + head_offset;
head_idx * output_head_stride; const scalar_t* suffix_head_ptr = suffix_output + head_offset;
const scalar_t* prefix_head_ptr = prefix_output + src_head_offset; scalar_t* output_head_ptr = output + head_offset;
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -143,7 +140,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast<float*>(prefix_lse.data_ptr()), \ reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \ reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \ reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size, prefix_head_stride, output_head_stride); \ num_heads, head_size); \
} }
/*@brief Merges the attention states from prefix and suffix /*@brief Merges the attention states from prefix and suffix
@@ -169,11 +166,17 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0); const uint num_tokens = output.size(0);
const uint num_heads = output.size(1); const uint num_heads = output.size(1);
const uint head_size = output.size(2); const uint head_size = output.size(2);
const uint prefix_head_stride = prefix_output.stride(1);
const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t); const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0, TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size); "headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr; float* output_lse_ptr = nullptr;
if (output_lse.has_value()) { if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>(); output_lse_ptr = output_lse.value().data_ptr<float>();

View File

@@ -1,7 +1,6 @@
#pragma once #pragma once
#include <torch/all.h> #include <torch/all.h>
#include <c10/util/Optional.h>
#include <map> #include <map>
#include <vector> #include <vector>
@@ -42,12 +41,11 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype); const double scale, const std::string& kv_cache_dtype);
void gather_and_maybe_dequant_cache( void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1] torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS] int64_t batch_size, const std::string& kv_cache_dtype,
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale, torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt); std::optional<torch::Tensor> seq_starts = std::nullopt);
@@ -59,15 +57,6 @@ void cp_gather_cache(
torch::Tensor const& cu_seq_lens, // [BATCH+1] torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt); int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
// Gather and upconvert FP8 KV cache to BF16 workspace
void cp_gather_and_upconvert_fp8_kv_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
torch::Tensor const& dst, // [TOT_TOKENS, 576]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& seq_lens, // [BATCH]
torch::Tensor const& workspace_starts, // [BATCH]
int64_t batch_size);
// Indexer K quantization and cache function // Indexer K quantization and cache function
void indexer_k_quant_and_cache( void indexer_k_quant_and_cache(
torch::Tensor& k, // [num_tokens, head_dim] torch::Tensor& k, // [num_tokens, head_dim]

View File

@@ -2,7 +2,6 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAException.h> #include <c10/cuda/CUDAException.h>
#include <c10/util/Optional.h>
#include "cuda_utils.h" #include "cuda_utils.h"
#include "cuda_compat.h" #include "cuda_compat.h"
@@ -515,8 +514,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
const int quant_block_size, // quantization block size const int quant_block_size, // quantization block size
const int cache_block_size, // cache block size const int cache_block_size, // cache block size
const int cache_stride, // stride for each token in kv_cache const int cache_stride, // stride for each token in kv_cache
const bool use_ue8m0 // use ue8m0 scale format
const bool use_ue8m0 // use ue8m0 scale format
) { ) {
constexpr int VEC_SIZE = 4; constexpr int VEC_SIZE = 4;
const int64_t token_idx = blockIdx.x; const int64_t token_idx = blockIdx.x;
@@ -554,11 +552,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
#ifndef USE_ROCM #ifndef USE_ROCM
__syncwarp(); __syncwarp();
#endif #endif
#if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f;
#else
float scale = fmaxf(amax, 1e-4) / 448.0f; float scale = fmaxf(amax, 1e-4) / 448.0f;
#endif
if (use_ue8m0) { if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale))); scale = exp2f(ceilf(log2f(scale)));
} }
@@ -907,80 +901,87 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
namespace vllm { namespace vllm {
// grid is launched with dimensions (batch, num_splits) // grid is launched with dimensions (batch, num_splits)
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt, template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
int ENTRY_SIZE, int CTA_SIZE>
__global__ void gather_and_maybe_dequant_cache( __global__ void gather_and_maybe_dequant_cache(
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...] // ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK] const int32_t block_size, const int32_t entry_size,
const int32_t num_tokens, const int32_t block_size,
const int64_t block_table_stride, const int64_t cache_block_stride, const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride, const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const float* __restrict__ scale, const float* __restrict__ scale,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch // batch
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
using ltype = vllm::vec_n_t<cache_t, vec_size>;
using stype = vllm::vec_n_t<scalar_t, vec_size>;
// We are adding this for code readability which will be optimized out when
// build in release.
assert(CTA_SIZE == blockDim.x);
#pragma unroll const int64_t bid = blockIdx.x; // Batch ID
for (int token_id = blockIdx.x; token_id < num_tokens; const int32_t num_splits = gridDim.y;
token_id += gridDim.x) { const int32_t split = blockIdx.y;
int64_t batch_id = token_to_seq[token_id]; const int32_t seq_start = cu_seq_lens[bid];
int64_t batch_start = cu_seq_lens[batch_id]; const int32_t seq_end = cu_seq_lens[bid + 1];
int64_t batch_end = cu_seq_lens[batch_id + 1]; const int32_t seq_len = seq_end - seq_start;
int32_t batch_offset = token_id - batch_start; const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
if (token_id >= batch_end) return; const int32_t split_start = split * split_blocks;
int32_t offset = 0; const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
if (seq_starts != nullptr) {
offset = seq_starts[batch_id];
}
batch_offset += offset;
int32_t block_table_id = batch_offset / block_size;
int32_t slot_id = batch_offset % block_size;
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
int32_t block_id = block_table[block_table_offset];
int64_t cache_offset =
block_id * cache_block_stride + slot_id * cache_entry_stride;
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
scalar_t* dst_ = dst + token_id * dst_entry_stride;
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
#pragma unroll const bool is_active_split = (split_start < tot_blocks);
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) { const bool is_last_split = (split_end == tot_blocks);
if (!is_active_split) return;
int32_t full_blocks_end = split_end;
int32_t partial_block_size = 0;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
// page_size)
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[bid] / block_size;
}
const int32_t* batch_block_table = block_table + batch_offset + offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
if (is_last_split) {
partial_block_size = seq_len % block_size;
if (partial_block_size) full_blocks_end -= 1;
}
auto copy_entry = [&](const cache_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
reinterpret_cast<stype*>(dst_)[idx] = _dst[i] = static_cast<scalar_t>(_src[i]);
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
} else { } else {
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx]; _dst[i] =
stype store_val; fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
#pragma unroll
for (int j = 0; j < vec_size; ++j) {
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
loaded_val.val[j], *scale);
}
reinterpret_cast<stype*>(dst_)[idx] = store_val;
} }
} }
// process tail };
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
dst_ = dst_ + ENTRY_SIZE - tail_cnt; for (int pid = split_start; pid < full_blocks_end; ++pid) {
src_ = src_ + ENTRY_SIZE - tail_cnt; auto block_id = batch_block_table[pid];
#pragma unroll auto block_start_ptr = src_cache + block_id * cache_block_stride;
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) { auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { for (int eid = 0; eid < block_size; ++eid) {
dst_[idx] = static_cast<scalar_t>(src_[idx]); copy_entry(block_start_ptr + eid * cache_entry_stride,
} else { block_dst_ptr + eid * dst_entry_stride);
dst_[idx] = }
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale); }
}
if (partial_block_size) {
auto block_id = batch_block_table[full_blocks_end];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
for (int eid = 0; eid < partial_block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
} }
} }
} }
@@ -991,38 +992,34 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor. // SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ #define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \ vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
thread_block_size> \ <<<grid, block, 0, stream>>>( \
<<<grid, block, 0, stream>>>( \ reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \ reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \ block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \ block_size, entry_size, block_table_stride, cache_block_stride, \
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \ cache_entry_stride, dst_entry_stride, \
block_table_stride, cache_block_stride, cache_entry_stride, \ reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
// Gather sequences from the cache into the destination tensor. // Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch // - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence // - block_table contains the cache block indices for each sequence
// - token_to_seq contains the back mapping from token_id to batch_id
// - Optionally, seq_starts (if provided) offsets the starting block index by // - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size) // (seq_starts[bid] / page_size)
void gather_and_maybe_dequant_cache( void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1] torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS] int64_t batch_size, const std::string& kv_cache_dtype,
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale, torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt) { std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1); int32_t block_size = src_cache.size(1);
int32_t head_dim = dst.size(-1); int32_t entry_size = src_cache.flatten(2, -1).size(2);
TORCH_CHECK(block_table.dtype() == torch::kInt32, TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32"); "block_table must be int32");
@@ -1032,9 +1029,6 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32"); "seq_starts must be int32");
} }
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(), TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device"); "src_cache and dst must be on the same device");
@@ -1052,9 +1046,10 @@ void gather_and_maybe_dequant_cache(
int64_t cache_entry_stride = src_cache.stride(1); int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0); int64_t dst_entry_stride = dst.stride(0);
constexpr int32_t thread_block_size = 64; // Decide on the number of splits based on the batch size.
dim3 grid(num_tokens); int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 block(thread_block_size); dim3 grid(batch_size, num_splits);
dim3 block(1024);
const int32_t* seq_starts_ptr = const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr; seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
@@ -1063,82 +1058,6 @@ void gather_and_maybe_dequant_cache(
} }
namespace vllm { namespace vllm {
// Gather and upconvert FP8 KV cache tokens to BF16 workspace
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ seq_lens, // [BATCH]
const int32_t* __restrict__ workspace_starts, // [BATCH]
const int32_t block_size, const int32_t head_dim,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = workspace_starts[bid];
const int32_t seq_len = seq_lens[bid];
const int32_t tot_slots = seq_len;
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
const int32_t split_start = split * split_slots;
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
const bool is_active_split = (split_start < tot_slots);
if (!is_active_split) return;
// Adjust the pointer for the block_table for this batch
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = split_start;
int32_t offset_div = offset / block_size;
offset = offset % block_size;
const int32_t* batch_block_table = block_table + batch_offset;
// Adjust dst pointer based on the cumulative sequence lengths
dst += seq_start * dst_entry_stride;
const int tid = threadIdx.x;
// Process each token in this split
for (int pid = split_start; pid < split_end; ++pid) {
auto block_id = batch_block_table[offset_div];
const uint8_t* token_ptr =
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
const uint8_t* no_pe_ptr = token_ptr;
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const __nv_bfloat16* rope_ptr =
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
if (tid < 512) {
// FP8 dequantization
const int tile = tid >> 7; // each tile is 128 elements
const float scale = scales_ptr[tile];
const uint8_t val = no_pe_ptr[tid];
dst_ptr[tid] =
fp8::scaled_convert<__nv_bfloat16, uint8_t,
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
} else if (tid < 576) {
// Rope copy (64 bf16 elements)
const int rope_idx = tid - 512;
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
}
// Move to next token
offset += 1;
if (offset == block_size) {
offset_div += 1;
offset = 0;
}
}
}
template <typename scalar_t> template <typename scalar_t>
// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
// block_size. // block_size.
@@ -1280,57 +1199,6 @@ void cp_gather_cache(
} }
} }
void cp_gather_and_upconvert_fp8_kv_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
torch::Tensor const& dst, // [TOT_TOKENS, 576]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& seq_lens, // [BATCH]
torch::Tensor const& workspace_starts, // [BATCH]
int64_t batch_size) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t head_dim = dst.size(1);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
"workspace_starts must be int32");
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
TORCH_CHECK(src_cache.device() == block_table.device(),
"src_cache and block_table must be on the same device");
TORCH_CHECK(src_cache.device() == seq_lens.device(),
"src_cache and seq_lens must be on the same device");
TORCH_CHECK(src_cache.device() == workspace_starts.device(),
"src_cache and workspace_starts must be on the same device");
TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
int64_t block_table_stride = block_table.stride(0);
int64_t cache_block_stride = src_cache.stride(0);
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(576);
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
src_cache.data_ptr<uint8_t>(),
reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
block_table_stride, cache_block_stride, cache_entry_stride,
dst_entry_stride);
}
// Macro to dispatch the kernel based on the data type. // Macro to dispatch the kernel based on the data type.
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \

Some files were not shown because too many files have changed in this diff Show More