Compare commits
4 Commits
v0.13.0rc3 ... v0.11.2

| Author | SHA1 | Date |
|---|---|---|
|  | 275de34170 |  |
|  | fa3ffb4365 |  |
|  | 6d5974369c |  |
|  | 0ce9990d2c |  |
Removed file (@@ -1,24 +0,0 @@):

```yaml
name: vllm_ci
job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"
```
.buildkite/generate_index.py (new file, 46 lines, @@ -0,0 +1,46 @@):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os

template = """<!DOCTYPE html>
<html>
    <body>
        <h1>Links for vLLM</h1/>
        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
    </body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
```
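The wheel-name handling is the whole trick in this script. A quick illustration of what the two `replace()` chains produce (the wheel name below is made up for the example, not taken from the diff):

```python
# Illustrative only: a hypothetical x86_64 wheel name, mapped the same way the script does.
name = "vllm-0.0.0+cu129-cp38-abi3-manylinux1_x86_64.whl"

arm = name.replace("x86_64", "aarch64").replace("manylinux1", "manylinux2014")
assert arm == "vllm-0.0.0+cu129-cp38-abi3-manylinux2014_aarch64.whl"

# CloudFront requires escaping '+' in the href, hence the "%2B" substitution.
assert name.replace("+", "%2B") == "vllm-0.0.0%2Bcu129-cp38-abi3-manylinux1_x86_64.whl"
```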
Removed file (@@ -1,56 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 8 ]]; then
    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
    merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
    merge_base_commit_build_args=""
fi

# build
docker buildx build --file docker/Dockerfile \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --build-arg USE_SCCACHE=1 \
    --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
    --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
    --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
    ${merge_base_commit_build_args} \
    --cache-from type=registry,ref=${CACHE_FROM},mode=max \
    --cache-to type=registry,ref=${CACHE_TO},mode=max \
    --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
    $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
    --push \
    --target test \
    --progress plain .
```
Removed file (@@ -1,57 +0,0 @@):

```yaml
group: Abuild
steps:
- label: ":docker: Build image"
  key: image-build
  depends_on: []
  commands:
  - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2

- label: ":docker: Build CPU image"
  key: image-build-cpu
  depends_on: []
  commands:
  - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2

- label: ":docker: Build HPU image"
  soft_fail: true
  depends_on: []
  key: image-build-hpu
  commands:
  - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2

- label: ":docker: Build CPU arm64 image"
  key: cpu-arm64-image-build
  depends_on: []
  optional: true
  commands:
  - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
  env:
    DOCKER_BUILDKIT: "1"
  retry:
    automatic:
    - exit_status: -1  # Agent was lost
      limit: 2
    - exit_status: -10  # Agent was lost
      limit: 2
```
Removed file (@@ -1,36 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --build-arg VLLM_CPU_AVX512BF16=true \
    --build-arg VLLM_CPU_AVX512VNNI=true \
    --build-arg VLLM_CPU_AMXBF16=true \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
    --target vllm-test \
    --progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
```
Removed file (@@ -1,33 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
    --target vllm-test \
    --progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
```
Removed file (@@ -1,34 +0,0 @@):

```bash
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build \
    --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
    --progress plain \
    https://github.com/vllm-project/vllm-gaudi.git

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
```
```diff
@@ -8,4 +8,3 @@ tasks:
     value: 0.80
   limit: 250 # will run on 250 * 14 subjects = 3500 samples
   num_fewshot: 5
-  rtol: 0.05
```
```diff
@@ -1 +0,0 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
```
```diff
@@ -9,40 +9,11 @@ pytest -s -v test_lm_eval_correctness.py \
     --tp-size=1
 """
 
-import os
-from contextlib import contextmanager
-
 import lm_eval
 import numpy as np
 import yaml
 
-DEFAULT_RTOL = 0.08
-
-
-@contextmanager
-def scoped_env_vars(new_env: dict[str, str]):
-    if not new_env:
-        # Fast path: nothing to do
-        yield
-        return
-
-    old_values = {}
-    new_keys = []
-
-    try:
-        for key, value in new_env.items():
-            if key in os.environ:
-                old_values[key] = os.environ[key]
-            else:
-                new_keys.append(key)
-            os.environ[key] = str(value)
-        yield
-    finally:
-        # Restore / clean up
-        for key, value in old_values.items():
-            os.environ[key] = value
-        for key in new_keys:
-            os.environ.pop(key, None)
+RTOL = 0.08
 
 
 def launch_lm_eval(eval_config, tp_size):
```
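For context, a minimal usage sketch of the removed `scoped_env_vars` helper (not part of the diff; it assumes the function shown above is in scope):

```python
import os

# Overrides apply only inside the `with` block and are restored (or removed) on exit.
os.environ.pop("VLLM_CPU_SGL_KERNEL", None)
with scoped_env_vars({"VLLM_CPU_SGL_KERNEL": "1"}):
    assert os.environ["VLLM_CPU_SGL_KERNEL"] == "1"
assert "VLLM_CPU_SGL_KERNEL" not in os.environ
```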
```diff
@@ -61,26 +32,23 @@ def launch_lm_eval(eval_config, tp_size):
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
     )
-
-    env_vars = eval_config.get("env_vars", None)
-    with scoped_env_vars(env_vars):
-        results = lm_eval.simple_evaluate(
-            model=backend,
-            model_args=model_args,
-            tasks=[task["name"] for task in eval_config["tasks"]],
-            num_fewshot=eval_config["num_fewshot"],
-            limit=eval_config["limit"],
-            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-            # text models. however, this is regressing measured strict-match for
-            # existing text models in CI, so only apply it for mm, or explicitly set
-            apply_chat_template=eval_config.get(
-                "apply_chat_template", backend == "vllm-vlm"
-            ),
-            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-            gen_kwargs=eval_config.get("gen_kwargs"),
-            batch_size=batch_size,
-        )
+    results = lm_eval.simple_evaluate(
+        model=backend,
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
+        batch_size=batch_size,
+    )
 
     return results
```
```diff
@@ -89,8 +57,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
-    rtol = eval_config.get("rtol", DEFAULT_RTOL)
-
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
```
```diff
@@ -98,9 +64,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth:.3f} | "
-                f"measured={measured_value:.3f} | rtol={rtol}"
+                f"ground_truth={ground_truth} | measured={measured_value}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
 
         assert success
```
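Both sides of this hunk end in the same relative-tolerance check. A small illustration of what `np.isclose` accepts with the fixed 0.08 tolerance (the values here are made up, not from any config):

```python
import numpy as np

# np.isclose applies rtol relative to its second argument (the measured value),
# so 0.75 passes against a ground truth of 0.80 while 0.70 does not.
assert np.isclose(0.80, 0.75, rtol=0.08)
assert not np.isclose(0.80, 0.70, rtol=0.08)
```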
````diff
@@ -108,65 +108,6 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
-#### Default Parameters Field
-
-We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
-
-<details>
-<summary> An Example of default parameters field </summary>
-
-```json
-{
-  "defaults": {
-    "qps_list": [
-      "inf"
-    ],
-    "server_environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
-    },
-    "server_parameters": {
-      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "block_size": 128,
-      "disable_log_stats": "",
-      "load_format": "dummy"
-    },
-    "client_parameters": {
-      "backend": "vllm",
-      "dataset_name": "random",
-      "random-input-len": 128,
-      "random-output-len": 128,
-      "num_prompts": 200,
-      "ignore-eos": ""
-    }
-  },
-  "tests": [
-    {
-      "test_name": "serving_llama3B_tp2_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 2,
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-      }
-    },
-    {
-      "test_name": "serving_qwen3_tp4_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-14B",
-        "tensor_parallel_size": 4,
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-14B",
-      }
-    },
-  ]
-}
-```
-
-</details>
-
 ### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
````
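The removed section above describes how the `defaults` field is folded into each test case. A minimal sketch of that behaviour in Python (the helper name `apply_defaults` and the example values are illustrative, not part of the repository):

```python
def apply_defaults(defaults: dict, test: dict) -> dict:
    """Fold global defaults into one test case; keys set by the test itself win."""
    merged = dict(test)
    merged.setdefault("qps_list", defaults.get("qps_list"))
    merged.setdefault("max_concurrency_list", defaults.get("max_concurrency_list"))
    for key in ("server_environment_variables", "server_parameters", "client_parameters"):
        merged[key] = {**defaults.get(key, {}), **test.get(key, {})}
    return merged


merged = apply_defaults(
    {"qps_list": ["inf"], "server_parameters": {"tensor_parallel_size": 1, "dtype": "bfloat16"}},
    {
        "test_name": "serving_llama3B_tp2_random_128_128",
        "server_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct", "tensor_parallel_size": 2},
    },
)
assert merged["server_parameters"]["tensor_parallel_size"] == 2  # test overrides the default
assert merged["server_parameters"]["dtype"] == "bfloat16"        # default fills the gap
assert merged["qps_list"] == ["inf"]
```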
```diff
@@ -110,8 +110,7 @@ json2envs() {
 wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
-  local timeout_val="1200"
-  timeout "$timeout_val" bash -c '
+  timeout 1200 bash -c '
     until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
```
```diff
@@ -317,44 +316,12 @@ run_throughput_tests() {
 run_serving_tests() {
   # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
-  #
-  # Supported JSON formats:
-  # 1) Plain format: top-level array
-  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #
-  # 2) Default parameters field + plain format tests
-  #    {
-  #      "defaults": { ... },
-  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
-  #    }
 
   local serving_test_file
   serving_test_file=$1
 
   # Iterate over serving tests
-  jq -c '
-    if type == "array" then
-      # Plain format: test cases array
-      .[]
-    elif (type == "object" and has("tests")) then
-      # merge the default parameters into each test cases
-      . as $root
-      | ($root.defaults // {}) as $d
-      | ($root.tests // [])[]
-      # default qps / max_concurrency from defaults if missing
-      | .qps_list = (.qps_list // $d.qps_list)
-      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
-      # merge envs / params: test overrides defaults
-      | .server_environment_variables =
-          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
-      | .server_parameters =
-          (($d.server_parameters // {}) + (.server_parameters // {}))
-      | .client_parameters =
-          (($d.client_parameters // {}) + (.client_parameters // {}))
-    else
-      error("Unsupported serving test file format: must be array or object with .tests")
-    end
-  ' "$serving_test_file" | while read -r params; do
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
 
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
     if [[ ! "$test_name" =~ ^serving_ ]]; then
```
```diff
@@ -368,25 +335,20 @@ run_serving_tests() {
       continue
     fi
 
-    # get client and server arguments (after merged the default parameters)
+    # get client and server arguments
     server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
     client_params=$(echo "$params" | jq -r '.client_parameters')
 
     server_args=$(json2args "$server_params")
     server_envs=$(json2envs "$server_envs")
     client_args=$(json2args "$client_params")
 
-    # qps_list
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
-    # max_concurrency_list (fallback to num_prompts if missing)
     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
       num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
       max_concurrency_list="[$num_prompts]"
     fi
     max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
     echo "Running over max concurrency list $max_concurrency_list"
```
New file (@@ -0,0 +1,610 @@):

```json
[
  {
    "test_name": "serving_llama8B_bf16_tp1_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_bf16_tp2_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_bf16_tp4_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_bf16_tp1_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_bf16_tp2_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_bf16_tp4_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int8_tp1_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int8_tp2_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int8_tp4_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int8_tp1_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int8_tp2_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int8_tp4_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int4_tp1_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int4_tp2_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int4_tp4_sharegpt",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
  },
  {
    "test_name": "serving_llama8B_int4_tp1_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int4_tp2_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  },
  {
    "test_name": "serving_llama8B_int4_tp4_random_128_128",
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
  }
]
```
.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json (new file, 1023 lines; diff suppressed because it is too large)
@@ -1,246 +1,276 @@
|
|||||||
{
|
[
|
||||||
"defaults": {
|
{
|
||||||
"qps_list": [
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
"inf"
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
],
|
"max_concurrency_list": [32],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"server_environment_variables": {
|
||||||
"server_environment_variables": {
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
{
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
"tensor_parallel_size": 1,
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"dtype": "bfloat16",
|
"max_concurrency_list": [32],
|
||||||
"distributed_executor_backend": "mp",
|
"server_environment_variables": {
|
||||||
"block_size": 128,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"trust_remote_code": "",
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"disable_log_stats": "",
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"enforce_eager": "",
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"max_num_batched_tokens": 2048,
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
"max_num_seqs": 256,
|
},
|
||||||
"load_format": "dummy"
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
{
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
"backend": "vllm",
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"ignore-eos": "",
|
"max_concurrency_list": [32],
|
||||||
"num_prompts": 200
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
  }
  },
]
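Each entry above pairs a `qps_list` with a `max_concurrency_list`, so a single test fans out into one benchmark run per (request rate, concurrency) combination. A minimal sketch of that expansion, assuming the config is a top-level JSON array saved as `serving-tests-cpu.json` (the path and runner logic are illustrative, not code from this repository):

# Illustrative only: expand one benchmark test entry into individual client runs.
import itertools
import json

with open("serving-tests-cpu.json") as f:  # hypothetical config path
    tests = json.load(f)

for test in tests:
    for qps, concurrency in itertools.product(
        test.get("qps_list", ["inf"]), test.get("max_concurrency_list", [None])
    ):
        # each (qps, concurrency) pair becomes one benchmark-client invocation
        print(test["test_name"], "request rate:", qps, "max concurrency:", concurrency)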
"tests": [
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
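In these configs an empty-string value (for example `"enforce_eager": ""` in the earlier block) conventionally marks a boolean switch, while other values are passed with an argument. A hedged sketch of how `server_parameters` might be turned into `vllm serve` CLI flags, assuming the usual underscore-to-dash conversion (the helper name is made up, not code from this repository):

# Illustrative only: turn a server_parameters dict into CLI arguments.
def server_args(params: dict) -> list[str]:
    args = []
    for key, value in params.items():
        if key == "model":
            continue  # the model is typically the positional argument
        flag = "--" + key.replace("_", "-")
        if value == "":
            args.append(flag)          # empty string => boolean switch
        else:
            args.extend([flag, str(value)])
    return args

print(server_args({"tensor_parallel_size": 2, "dtype": "bfloat16", "enforce_eager": ""}))
# ['--tensor-parallel-size', '2', '--dtype', 'bfloat16', '--enforce-eager']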
@@ -8,28 +8,13 @@ steps:
  commands:
    # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

- label: "Build arm64 wheel - CUDA 13.0"
  depends_on: ~
  id: build-wheel-arm64-cuda-13-0
  agents:
    queue: arm64_cpu_queue_postmerge
  commands:
    # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
  env:
    DOCKER_BUILDKIT: "1"

# aarch64 build
- label: "Build arm64 CPU wheel"
  depends_on: ~

@@ -40,11 +25,24 @@ steps:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
  depends_on: ~
  id: build-wheel-cuda-12-8
  agents:
    queue: cpu_queue_postmerge
  commands:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

- label: "Build wheel - CUDA 12.9"
  depends_on: ~
  id: build-wheel-cuda-12-9

@@ -54,7 +52,7 @@ steps:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

@@ -67,21 +65,7 @@ steps:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

# x86 CPU wheel build
- label: "Build x86 CPU wheel"
  depends_on: ~
  id: build-wheel-x86-cpu
  agents:
    queue: cpu_queue_postmerge
  commands:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
  env:
    DOCKER_BUILDKIT: "1"

@@ -125,6 +109,7 @@ steps:
- label: "Annotate release workflow"
  depends_on:
    - create-multi-arch-manifest
    - build-wheel-cuda-12-8
  id: annotate-release-workflow
  agents:
    queue: cpu_queue_postmerge
@@ -23,8 +23,8 @@ To download the wheel (by version):
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
```

To download and upload the image:

@@ -45,9 +45,8 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
```
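The `+cu126`/`+cu129`/`+cu130` variant suffixes contain a literal `+`, which is fine for `aws s3 cp` but must be percent-encoded as `%2B` when the same object name appears in a plain HTTPS URL (the index generator below does this with `urllib.parse.quote`). A small illustrative sketch, not part of the release tooling:

# Illustrative only: percent-encode the '+' in a variant wheel filename for a URL.
from urllib.parse import quote

wheel = "vllm-0.11.2+cu129-cp38-abi3-manylinux1_x86_64.whl"  # example filename
print(quote(wheel))  # vllm-0.11.2%2Bcu129-cp38-abi3-manylinux1_x86_64.whl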
@@ -1,400 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# do not complain about line length (for docstring)
# ruff: noqa: E501

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote

import regex as re

if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""


@dataclass
class WheelFileInfo:
    package_name: str
    version: str
    build_tag: str | None
    python_tag: str
    abi_tag: str
    platform_tag: str
    variant: str | None
    filename: str


def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Parse wheel file name to extract metadata.

    The format of wheel names:
    {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
    Example:
    vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
    vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
    vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
    vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
    """
    wheel_file_re = re.compile(
        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    )
    match = wheel_file_re.match(file)
    if not match:
        raise ValueError(f"Invalid wheel file name: {file}")

    package_name = match.group("package_name")
    version = match.group("version")
    build_tag = match.group("build_tag")
    python_tag = match.group("python_tag")
    abi_tag = match.group("abi_tag")
    platform_tag = match.group("platform_tag")

    # extract variant from version
    variant = None
    if "dev" in version:
        ver_after_dev = version.split("dev")[-1]
        if "." in ver_after_dev:
            variant = ver_after_dev.split(".")[-1]
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
            version, variant = version.split("+")

    return WheelFileInfo(
        package_name=package_name,
        version=version,
        build_tag=build_tag,
        python_tag=python_tag,
        abi_tag=abi_tag,
        platform_tag=platform_tag,
        variant=variant,
        filename=file,
    )
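# --- Illustrative usage (editor's note, not part of the original script) ------
# parse_from_filename() splits the variant out of the version, so a cu130 dev
# wheel parses roughly like this:
#
#   info = parse_from_filename(
#       "vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl"
#   )
#   info.version       -> "0.11.1rc8.dev14+gaa384b3c0"
#   info.variant       -> "cu130"
#   info.platform_tag  -> "manylinux1_x86_64"
# -------------------------------------------------------------------------------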
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Generate project list HTML content linking to each project & variant sub-directory.
    """
    href_tags = []
    for name in sorted(subdir_names):
        name = name.strip("/").strip(".")
        href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)


def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Generate package index HTML content for a specific package, linking to actual wheel files.
    """
    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        relative_path = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        )
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
        file_meta = asdict(file)
        file_meta["path"] = file_path_quoted
        metadata.append(file_meta)
    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str


def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
):
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str | None): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
    index_base_dir/  (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
        index.html           # project list, linking to "vllm/" and other packages, and all variant sub-directories
        vllm/
            index.html       # package index, pointing to actual files in wheel_base_dir (relative path)
            metadata.json    # machine-readable metadata for all wheels in this package
        cpu/                 # cpu variant sub-directory
            index.html
            vllm/
                index.html
                metadata.json
        cu129/               # cu129 is actually the alias to default variant
            index.html
            vllm/
                index.html
                metadata.json
        cu130/               # cu130 variant sub-directory
            index.html
            vllm/
                index.html
                metadata.json
        ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
    [
        {
            "package_name": "vllm",
            "version": "0.10.2rc2",
            "build_tag": null,
            "python_tag": "cp38",
            "abi_tag": "abi3",
            "platform_tag": "manylinux2014_aarch64",
            "variant": "cu129",
            "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
            "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
        },
        ...
    ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)

        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)
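# --- Illustrative invocation (editor's note, not part of the original script) --
# With one default wheel and one cu130 wheel for a commit, something like:
#
#   generate_index_and_metadata(
#       whl_files=[
#           "vllm-0.11.2-cp38-abi3-manylinux1_x86_64.whl",
#           "vllm-0.11.2+cu130-cp38-abi3-manylinux1_x86_64.whl",
#       ],
#       wheel_base_dir=Path("wheels/<commit>"),   # hypothetical local mirror path
#       index_base_dir=Path("indices"),
#       alias_to_default="cu129",
#   )
#
# would write the default package index at indices/, an indices/cu129/ alias of
# that default index, and an indices/cu130/ variant index, all with relative
# links back to the wheel directory.
# -------------------------------------------------------------------------------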
if __name__ == "__main__":
|
|
||||||
"""
|
|
||||||
Arguments:
|
|
||||||
--version <version> : version string for the current build (e.g., commit hash)
|
|
||||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
|
||||||
--output-dir <output_directory> : directory to store generated index files
|
|
||||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
|
||||||
--comment <comment_string> : (optional) comment string to include in generated HTML files
|
|
||||||
"""
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Process nightly build wheel files to generate indices."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--version",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Version string for the current build (e.g., commit hash)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--current-objects",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Path to JSON file containing current S3 objects listing in this version directory",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--output-dir",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="Directory to store generated index files",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--alias-to-default",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Alias variant name for the default variant",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--comment",
|
|
||||||
type=str,
|
|
||||||
default="",
|
|
||||||
help="Optional comment string to include in generated HTML files",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
version = args.version
|
|
||||||
if "/" in version or "\\" in version:
|
|
||||||
raise ValueError("Version string must not contain slashes.")
|
|
||||||
current_objects_path = Path(args.current_objects)
|
|
||||||
output_dir = Path(args.output_dir)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Read current objects JSON
|
|
||||||
with open(current_objects_path) as f:
|
|
||||||
current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
|
|
||||||
|
|
||||||
# current_objects looks like from list_objects_v2 S3 API:
|
|
||||||
"""
|
|
||||||
"Contents": [
|
|
||||||
{
|
|
||||||
"Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
|
|
||||||
"LastModified": "2025-11-28T14:00:32+00:00",
|
|
||||||
"ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
|
|
||||||
"ChecksumAlgorithm": [
|
|
||||||
"CRC64NVME"
|
|
||||||
],
|
|
||||||
"ChecksumType": "FULL_OBJECT",
|
|
||||||
"Size": 435649349,
|
|
||||||
"StorageClass": "STANDARD"
|
|
||||||
},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Extract wheel file keys
|
|
||||||
wheel_files = []
|
|
||||||
for item in current_objects.get("Contents", []):
|
|
||||||
key: str = item["Key"]
|
|
||||||
if key.endswith(".whl"):
|
|
||||||
wheel_files.append(key.split("/")[-1]) # only the filename is used
|
|
||||||
|
|
||||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
|
||||||
|
|
||||||
# keep only "official" files for a non-nightly version (specifed by cli args)
|
|
||||||
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
|
||||||
if PY_VERSION_RE.match(version):
|
|
||||||
# upload-wheels.sh ensures no "dev" is in args.version
|
|
||||||
wheel_files = list(
|
|
||||||
filter(lambda x: version in x and "dev" not in x, wheel_files)
|
|
||||||
)
|
|
||||||
print(f"Non-nightly version detected, wheel files used: {wheel_files}")
|
|
||||||
else:
|
|
||||||
print("Nightly version detected, keeping all wheel files.")
|
|
||||||
|
|
||||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
|
||||||
# s3://vllm-wheels/{version}/<wheel files>
|
|
||||||
# s3://vllm-wheels/<anything>/<index files>
|
|
||||||
wheel_base_dir = Path(output_dir).parent / version
|
|
||||||
index_base_dir = Path(output_dir)
|
|
||||||
|
|
||||||
generate_index_and_metadata(
|
|
||||||
whl_files=wheel_files,
|
|
||||||
wheel_base_dir=wheel_base_dir,
|
|
||||||
index_base_dir=index_base_dir,
|
|
||||||
default_variant=None,
|
|
||||||
alias_to_default=args.alias_to_default,
|
|
||||||
comment=args.comment.strip(),
|
|
||||||
)
|
|
||||||
print(f"Successfully generated index and metadata in {output_dir}")
|
|
||||||
@@ -1,68 +0,0 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}

export CMAKE_BUILD_PARALLEL_LEVEL=16

# Setup cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .

# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test

function cpu_tests() {
  set -e

  docker exec cpu-test bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"

  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving
  docker exec cpu-test bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model Qwen/Qwen3-0.6B \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
}

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2h bash -c cpu_tests
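The online-serving step above starts `vllm serve` in the background and polls `/v1/models` until it responds before launching the benchmark client. A hedged Python equivalent of that readiness loop, for reference only (the helper name is made up, not part of the CI scripts):

# Illustrative only: poll a server endpoint until it answers or a deadline passes.
import time
import urllib.request

def wait_for_server(url: str = "http://localhost:8000/v1/models", timeout_s: int = 600) -> None:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5):
                return  # server is up
        except OSError:
            time.sleep(1)  # not ready yet, retry
    raise TimeoutError(f"server at {url} did not become ready in {timeout_s}s")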
@@ -25,22 +25,20 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator tblib
    pip install sentence-transformers datamodel_code_generator

    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -73,11 +73,12 @@ function cpu_tests() {
  pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Run AWQ/GPTQ test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
      tests/quantization/test_cpu_wna16.py"

  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
  #   pytest -x -s -v \
  #     tests/quantization/test_ipex_quant.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -74,7 +74,6 @@ FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
@@ -35,10 +35,9 @@ docker run \
  echo $ZE_AFFINITY_MASK
  pip install tblib==3.1.0
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
  python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
  VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  cd tests
  pytest -v -s v1/core
@@ -47,6 +46,6 @@ docker run \
  pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
  pytest -v -s v1/structured_output
  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
  pytest -v -s v1/test_serial_utils.py
  '
@@ -12,11 +12,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
  echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
  exit 0
fi

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
@@ -17,17 +17,7 @@ wait_for_server() {
}

MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -2,9 +2,9 @@
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8040}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

@@ -16,18 +16,8 @@ wait_for_server() {
  done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -46,14 +36,11 @@ for BACK in "${BACKENDS[@]}"; do
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
@@ -1,74 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -2,28 +2,6 @@
|
|||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# ======== part 0: setup ========
|
|
||||||
|
|
||||||
BUCKET="vllm-wheels"
|
|
||||||
INDICES_OUTPUT_DIR="indices"
|
|
||||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
|
||||||
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
|
|
||||||
SUBPATH=$BUILDKITE_COMMIT
|
|
||||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
|
||||||
|
|
||||||
# detect if python3.10+ is available
|
|
||||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
|
||||||
if [[ "$has_new_python" -eq 0 ]]; then
|
|
||||||
# use new python from docker
|
|
||||||
docker pull python:3-slim
|
|
||||||
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Using python interpreter: $PYTHON"
|
|
||||||
echo "Python version: $($PYTHON --version)"
|
|
||||||
|
|
||||||
# ========= part 1: collect, rename & upload the wheel ==========
|
|
||||||
|
|
||||||
# Assume wheels are in artifacts/dist/*.whl
|
# Assume wheels are in artifacts/dist/*.whl
|
||||||
wheel_files=(artifacts/dist/*.whl)
|
wheel_files=(artifacts/dist/*.whl)
|
||||||
|
|
||||||
@@ -32,76 +10,74 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
 echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
 exit 1
 fi

+# Get the single wheel file
 wheel="${wheel_files[0]}"

-# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
-# we also accept params as manylinux tag
-# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="${1:-manylinux_2_31}"
+# Detect architecture and rename 'linux' to appropriate manylinux version
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+manylinux_version="manylinux2014"
+else
+echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+manylinux_version="manylinux1"
+fi

 # Rename 'linux' to the appropriate manylinux version in the wheel filename
-if [[ "$wheel" != *"linux"* ]]; then
-echo "Error: Wheel filename does not contain 'linux': $wheel"
-exit 1
-fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
-echo "Renamed wheel to: $wheel"

 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version in wheel: $version"
-pure_version="${version%%+*}"
-echo "Pure version (without variant): $pure_version"
+echo "Version: $version"

-# copy wheel to its own bucket
-aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+normal_wheel="$wheel" # Save the original wheel filename

-# ========= part 2: generate and upload indices ==========
-# generate indices for all existing wheels in the commit directory
-# this script might be run multiple times if there are multiple variants being built
-# so we need to guarantee there is little chance for "TOCTOU" issues
-# i.e., one process is generating indices while another is uploading a new wheel
-# so we need to ensure no time-consuming operations happen below
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+suffix="${version##*.}"
+if [[ $suffix == cu* ]]; then
+new_version="1.0.0.dev+${suffix}"
+else
+new_version="1.0.0.dev"
+fi
+new_wheel="${wheel/$version/$new_version}"
+# use cp to keep both files in the artifacts directory
+cp -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+version="$new_version"
+fi

-# list all wheels in the commit directory
-echo "Existing wheels on S3:"
-aws s3 ls "$S3_COMMIT_PREFIX"
-obj_json="objects.json"
-aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
-mkdir -p "$INDICES_OUTPUT_DIR"
+# Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"

-# call script to generate indicies for all existing wheels
-# this indices have relative paths that could work as long as it is next to the wheel directory in s3
-# i.e., the wheels are always in s3://vllm-wheels/<commit>/
-# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
-alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+# generate index for this commit
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+if [[ $normal_wheel == *"cu129"* ]]; then
+# only upload index.html for cu129 wheels (default wheels) as it
+# is available on both x86 and arm64
+aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 else
-alias_arg=""
+echo "Skipping index files for non-cu129 wheels"
 fi

-# HACK: we do not need regex module here, but it is required by pre-commit hook
-# To avoid any external dependency, we simply replace it back to the stdlib re module
-sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
+# generate index for nightly
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-# copy indices to /<commit>/ unconditionally
-echo "Uploading indices to $S3_COMMIT_PREFIX"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
-# copy to /nightly/ only if it is on the main branch and not a PR
-if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-echo "Uploading indices to overwrite /nightly/"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
+if [[ $normal_wheel == *"cu129"* ]]; then
+# only upload index.html for cu129 wheels (default wheels) as it
+# is available on both x86 and arm64
+aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+echo "Skipping index files for non-cu129 wheels"
 fi

-# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
-if [[ "$version" != *"dev"* ]]; then
-echo "Re-generating indices for /$pure_version/"
-rm -rf "$INDICES_OUTPUT_DIR/*"
-mkdir -p "$INDICES_OUTPUT_DIR"
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
-fi
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
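For context on the hunk above: the v0.11.2 side retags the freshly built wheel by swapping the generic 'linux' platform tag for a manylinux tag derived from the build architecture, while the v0.13.0rc3 side takes the manylinux tag as a script argument instead. A minimal stand-alone sketch of the v0.11.2 logic, assuming a hypothetical wheel filename:

    # sketch: retag a built wheel with a manylinux platform tag (filename is hypothetical)
    wheel="artifacts/dist/vllm-0.11.2-cp38-abi3-linux_x86_64.whl"
    case "$(uname -m)" in
        x86_64)  manylinux_version="manylinux1" ;;
        aarch64) manylinux_version="manylinux2014" ;;
        *)       manylinux_version="manylinux1" ;;  # same fallback the script warns about
    esac
    new_wheel="${wheel/linux/$manylinux_version}"
    mv -- "$wheel" "$new_wheel"
    echo "Renamed wheel to: $new_wheel"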
@@ -39,9 +39,9 @@ steps:
 # if this test fails, it means the nightly torch version is not compatible with some
 # of the dependencies. Please check the error message and add the package to whitelist
 # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 soft_fail: true
 source_file_dependencies:
 - requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:

 - label: Async Engine, Inputs, Utils, Worker Test # 10min
 timeout_in_minutes: 15
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/multimodal
@@ -61,31 +61,25 @@ steps:
 - pytest -v -s -m 'not cpu_test' multimodal
 - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
-timeout_in_minutes: 30
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+timeout_in_minutes: 10
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/test_inputs.py
 - tests/test_outputs.py
 - tests/multimodal
 - tests/standalone_tests/lazy_imports.py
-- tests/tokenizers_
-- tests/tool_parsers
 - tests/transformers_utils
-- tests/config
 no_gpu: true
 commands:
 - python3 standalone_tests/lazy_imports.py
 - pytest -v -s test_inputs.py
 - pytest -v -s test_outputs.py
 - pytest -v -s -m 'cpu_test' multimodal
-- pytest -v -s tokenizers_
-- pytest -v -s tool_parsers
 - pytest -v -s transformers_utils
-- pytest -v -s config

 - label: Python-only Installation Test # 10min
 timeout_in_minutes: 20
@@ -117,9 +111,9 @@ steps:
 - pytest -v -s basic_correctness/test_cpu_offload.py

 - label: Entrypoints Unit Tests # 5min
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 timeout_in_minutes: 10
 working_dir: "/vllm-workspace/tests"
 fast_check: true
@@ -193,7 +187,7 @@ steps:
 - tests/distributed/test_utils
 - tests/distributed/test_pynccl
 - tests/distributed/test_events
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness
 - examples/offline_inference/rlhf.py
 - examples/offline_inference/rlhf_colocate.py
 - tests/examples/offline_inference/data_parallel.py
@@ -216,13 +210,12 @@ steps:
 # test with internal dp
 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
 - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 - pytest -v -s distributed/test_utils.py
-- pytest -v -s compile/fullgraph/test_basic_correctness.py
+- pytest -v -s compile/test_basic_correctness.py
 - pytest -v -s distributed/test_pynccl.py
 - pytest -v -s distributed/test_events.py
 - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -255,9 +248,9 @@ steps:
 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: EPLB Algorithm Test # 5min
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 timeout_in_minutes: 15
 working_dir: "/vllm-workspace/tests"
 source_file_dependencies:
@@ -313,25 +306,28 @@ steps:
 - pytest -v -s test_regression.py
 working_dir: "/vllm-workspace/tests" # optional

-- label: Engine Test # 9min
-timeout_in_minutes: 15
+- label: Engine Test # 25min
+timeout_in_minutes: 40
 mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
 # grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/engine
+- tests/tokenization
 - tests/test_sequence
 - tests/test_config
 - tests/test_logger
 - tests/test_vllm_port
 commands:
 - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+# OOM in the CI unless we run this separately
+- pytest -v -s tokenization

-- label: V1 Test e2e + engine # 65min
-timeout_in_minutes: 90
+- label: V1 Test e2e + engine # 30min
+timeout_in_minutes: 45
 mirror_hardwares: [amdexperimental]
-agent_pool: mi325_4
+agent_pool: mi325_1
 # grade: Blocking
 source_file_dependencies:
 - vllm/
@@ -344,9 +340,9 @@ steps:

 - label: V1 Test entrypoints # 35min
 timeout_in_minutes: 50
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/v1
@@ -394,34 +390,10 @@ steps:
 commands:
 - pytest -v -s v1/attention

-- label: Batch Invariance Tests (H100) # 10min
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_1
-timeout_in_minutes: 25
-gpu: h100
-source_file_dependencies:
-- vllm/v1/attention
-- vllm/model_executor/layers
-- tests/v1/determinism/
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pip install pytest-timeout pytest-forked
-- pytest -v -s v1/determinism/test_batch_invariance.py
-- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
-
-- label: V1 Test attention (B200) # 10min
-timeout_in_minutes: 30
-gpu: b200
-source_file_dependencies:
-- vllm/v1/attention
-- tests/v1/attention
-commands:
-- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
-
 - label: V1 Test others (CPU) # 5 mins
-mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
-grade: Blocking
+# grade: Blocking
 source_file_dependencies:
 - vllm/
 - tests/v1
@@ -437,34 +409,29 @@ steps:

 - label: Examples Test # 30min
 timeout_in_minutes: 45
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
 working_dir: "/vllm-workspace/examples"
 source_file_dependencies:
 - vllm/entrypoints
-- vllm/multimodal
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-# for basic
-- python3 offline_inference/basic/chat.py
 - python3 offline_inference/basic/generate.py --model facebook/opt-125m
 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+- python3 offline_inference/basic/chat.py
+- python3 offline_inference/prefix_caching.py
+- python3 offline_inference/llm_engine_example.py
+- python3 offline_inference/audio_language.py --seed 0
+- python3 offline_inference/vision_language.py --seed 0
+- python3 offline_inference/vision_language_pooling.py --seed 0
+- python3 offline_inference/vision_language_multi_image.py --seed 0
+- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
 - python3 offline_inference/basic/classify.py
 - python3 offline_inference/basic/embed.py
 - python3 offline_inference/basic/score.py
-# for multi-modal models
-- python3 offline_inference/audio_language.py --seed 0
-- python3 offline_inference/vision_language.py --seed 0
-- python3 offline_inference/vision_language_multi_image.py --seed 0
-- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-# for pooling models
-- python3 pooling/pooling/vision_language_pooling.py --seed 0
-# for features demo
-- python3 offline_inference/prefix_caching.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -518,7 +485,7 @@ steps:

 - label: PyTorch Compilation Unit Tests # 15min
 timeout_in_minutes: 30
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
 torch_nightly: true
@@ -526,16 +493,21 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run unit tests defined directly under compile/,
-# not including subdirectories, which are usually heavier
-# tests covered elsewhere.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+- pytest -v -s compile/test_pass_manager.py
+- pytest -v -s compile/test_fusion.py
+- pytest -v -s compile/test_fusion_attn.py
+- pytest -v -s compile/test_functionalization.py
+- pytest -v -s compile/test_silu_mul_quant_fusion.py
+# - pytest -v -s compile/test_sequence_parallelism.py
+# - pytest -v -s compile/test_async_tp.py
+- pytest -v -s compile/test_fusion_all_reduce.py
+- pytest -v -s compile/test_decorator.py
+- pytest -v -s compile/test_noop_elimination.py
+- pytest -v -s compile/test_aot_compile.py

 - label: PyTorch Fullgraph Smoke Test # 15min
 timeout_in_minutes: 30
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
 torch_nightly: true
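The removed commands in the hunk above replace a per-file pytest list with a find loop so that each top-level test file under compile/ runs in its own pytest process (per the comment referencing issue 28965). A minimal sketch of that pattern as plain shell, with an equivalent loop form; the directory name is taken from the hunk:

    # run every top-level test file in its own pytest process
    find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;

    # equivalent explicit loop, sometimes easier to read in scripts
    for f in compile/test_*.py; do
        pytest -s -v "$f"
    done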
@@ -543,11 +515,9 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run smoke tests under fullgraph directory, except test_full_graph.py
-# as it is a heavy test that is covered in other steps.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+- pytest -v -s compile/test_basic_correctness.py
+- pytest -v -s compile/test_multimodal_compile.py
+- pytest -v -s compile/piecewise/

 - label: PyTorch Fullgraph Test # 27min
 timeout_in_minutes: 40
@@ -559,10 +529,10 @@ steps:
 - vllm/
 - tests/compile
 commands:
-- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
 # Limit to no custom ops to reduce running time
 # Wrap with quotes to escape yaml and avoid starting -k string with a -
-- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"

 - label: Cudagraph test
 timeout_in_minutes: 20
@@ -591,7 +561,7 @@ steps:

 - label: Kernels Attention Test %N # 23min
 timeout_in_minutes: 35
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_8
 # grade: Blocking
 source_file_dependencies:
@@ -618,7 +588,7 @@ steps:

 - label: Kernels MoE Test %N # 40min
 timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_8
 # grade: Blocking
 source_file_dependencies:
@@ -645,26 +615,6 @@ steps:
 commands:
 - pytest -v -s kernels/mamba

-- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
-# Not replicating for CUTLAS & CuTe
-timeout_in_minutes: 45
-gpu: h100
-num_gpus: 1
-source_file_dependencies:
-- tools/install_deepgemm.sh
-- vllm/utils/deep_gemm.py
-- vllm/model_executor/layers/fused_moe
-- vllm/model_executor/layers/quantization
-- tests/kernels/quantization/test_block_fp8.py
-- tests/kernels/moe/test_deepgemm.py
-- tests/kernels/moe/test_batched_deepgemm.py
-- tests/kernels/attention/test_deepgemm_attention.py
-commands:
-- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
-- pytest -v -s kernels/moe/test_deepgemm.py
-- pytest -v -s kernels/moe/test_batched_deepgemm.py
-- pytest -v -s kernels/attention/test_deepgemm_attention.py

 - label: Model Executor Test # 23min
 timeout_in_minutes: 35
 torch_nightly: true
@@ -723,18 +673,16 @@ steps:
 # we can only upgrade after this is resolved
 # TODO(jerryzh168): resolve the above comment
 - uv pip install --system torchao==0.13.0
-- uv pip install --system conch-triton-kernels
 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

-- label: LM Eval Small Models # 53min
-timeout_in_minutes: 75
-mirror_hardwares: [amdexperimental]
+- label: LM Eval Small Models # 15min
+timeout_in_minutes: 20
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
 # grade: Blocking
 source_file_dependencies:
 - csrc/
 - vllm/model_executor/layers/quantization
-autorun_on_main: true
 commands:
 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -747,9 +695,9 @@ steps:
 - csrc/
 - vllm/entrypoints/openai/
 - vllm/model_executor/models/whisper.py
-commands: # LMEval+Transcription WER check
+commands: # LMEval
 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-- pytest -s entrypoints/openai/correctness/
+- pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py

 - label: OpenAI-Compatible Tool Use # 23 min
 timeout_in_minutes: 35
@@ -761,7 +709,19 @@ steps:
 - vllm/
 - tests/tool_use
 commands:
-- pytest -v -s tool_use
+- pytest -v -s -m 'not cpu_test' tool_use

+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+mirror_hardwares: [amdexperimental, amdproduction]
+agent_pool: mi325_1
+# grade: Blocking
+timeout_in_minutes: 10
+source_file_dependencies:
+- vllm/
+- tests/tool_use
+no_gpu: true
+commands:
+- pytest -v -s -m 'cpu_test' tool_use

 ##### models test #####

@@ -786,7 +746,6 @@ steps:
 torch_nightly: true
 source_file_dependencies:
 - vllm/model_executor/models/
-- vllm/transformers_utils/
 - tests/models/test_initialization.py
 commands:
 # Only when vLLM model source is modified - test initialization of a large
@@ -932,18 +891,6 @@ steps:
 commands:
 - pytest -v -s models/language/pooling_mteb_test

-- label: Multi-Modal Processor Test (CPU)
-timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_1
-source_file_dependencies:
-- vllm/
-- tests/models/multimodal
-no_gpu: true
-commands:
-- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

 - label: Multi-Modal Processor Test # 44min
 timeout_in_minutes: 60
 mirror_hardwares: [amdexperimental]
@@ -971,8 +918,8 @@ steps:
 - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
 - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

-- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
-timeout_in_minutes: 180
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+timeout_in_minutes: 70
 mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_1
 # grade: Blocking
@@ -984,8 +931,7 @@ steps:
 commands:
 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

-- label: Multi-Modal Models Test (Extended) 1 # 60min
-timeout_in_minutes: 120
+- label: Multi-Modal Models Test (Extended) 1
 mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
@@ -1009,8 +955,7 @@ steps:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

-- label: Multi-Modal Models Test (Extended) 3 # 75min
-timeout_in_minutes: 150
+- label: Multi-Modal Models Test (Extended) 3
 mirror_hardwares: [amdexperimental]
 agent_pool: mi325_1
 # grade: Blocking
@@ -1053,12 +998,12 @@ steps:
 optional: true
 commands:
 - pip install --upgrade git+https://github.com/huggingface/transformers
-- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+- pytest -v -s tests/models/test_initialization.py
 - pytest -v -s tests/models/test_transformers.py
-# - pytest -v -s tests/models/multimodal/processing/
-- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+- pytest -v -s tests/models/multimodal/processing/
+- pytest -v -s tests/models/multimodal/test_mapping.py
 - python3 examples/offline_inference/basic/chat.py
-# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
 # Whisper needs spawn method to avoid deadlock
 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -1102,9 +1047,8 @@ steps:
 - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
 - pytest -v -s tests/kernels/moe/test_flashinfer.py
-- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

-- label: Blackwell Fusion and Compile Tests # 30 min
+- label: Blackwell Fusion Tests # 30 min
 timeout_in_minutes: 40
 working_dir: "/vllm-workspace/"
 gpu: b200
@@ -1112,29 +1056,20 @@ steps:
 - csrc/quantization/fp4/
 - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
 - vllm/v1/attention/backends/flashinfer.py
-- vllm/v1/worker/
-- vllm/v1/cudagraph_dispatcher.py
 - vllm/compilation/
 # can affect pattern matching
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/test_fusion_attn.py
-- tests/compile/test_silu_mul_quant_fusion.py
-- tests/compile/distributed/test_fusion_all_reduce.py
-- tests/compile/distributed/test_fusions_e2e.py
-- tests/compile/fullgraph/test_full_graph.py
 commands:
 - nvidia-smi
 - pytest -v -s tests/compile/test_fusion_attn.py
 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 # this runner has 2 GPUs available even though num_gpus=2 is not set
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
 # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
 # Wrap with quotes to escape yaml
-- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"

 - label: Blackwell Fusion E2E Tests # 30 min
 timeout_in_minutes: 40
@@ -1151,16 +1086,20 @@ steps:
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/distributed/test_fusions_e2e.py
+- tests/compile/test_fusions_e2e.py
+- tests/compile/test_full_graph.py
 commands:
 - nvidia-smi
 # Run all e2e fusion tests
-- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+- pytest -v -s tests/compile/test_fusions_e2e.py
+# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

-- label: Blackwell GPT-OSS Eval
+- label: ROCm GPT-OSS Eval
 timeout_in_minutes: 60
 working_dir: "/vllm-workspace/"
-gpu: b200
+agent_pool: mi325_1
+mirror_hardwares: [amdproduction]
 optional: true # run on nightlies
 source_file_dependencies:
 - tests/evals/gpt_oss
@@ -1169,7 +1108,7 @@ steps:
 - vllm/v1/attention/backends/flashinfer.py
 commands:
 - uv pip install --system 'gpt-oss[eval]==0.0.5'
-- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 - label: Blackwell Quantized MoE Test
 timeout_in_minutes: 60
@@ -1259,7 +1198,7 @@ steps:
 - vllm/worker/worker_base.py
 - vllm/v1/engine/
 - vllm/v1/worker/
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness.py
 - tests/compile/test_wrapper.py
 - tests/distributed/
 - tests/entrypoints/llm/test_collective_rpc.py
@@ -1269,11 +1208,10 @@ steps:
 - tests/v1/worker/test_worker_memory_snapshot.py
 commands:
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
 - pytest -v -s entrypoints/llm/test_collective_rpc.py
-- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+- pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1305,7 +1243,7 @@ steps:

 - label: Plugin Tests (2 GPUs) # 40min
 timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_2
 # grade: Blocking
 working_dir: "/vllm-workspace/tests"
@@ -1373,27 +1311,11 @@ steps:
 - pytest -v -s -x lora/test_llama_tp.py
 - pytest -v -s -x lora/test_llm_with_multi_loras.py
 - pytest -v -s -x lora/test_olmoe_tp.py
-# Disabled for now because MXFP4 backend on non-cuda platform
-# doesn't support LoRA yet
-#- pytest -v -s -x lora/test_gptoss_tp.py
+- pytest -v -s -x lora/test_gptoss_tp.py

 - label: Weight Loading Multiple GPU Test # 33min
 timeout_in_minutes: 45
-mirror_hardwares: [amdexperimental, amdproduction]
-agent_pool: mi325_2
-# grade: Blocking
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-optional: true
-source_file_dependencies:
-- vllm/
-- tests/weight_loading
-commands:
-- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
-
-- label: Weight Loading Multiple GPU Test - Large Models # optional
 mirror_hardwares: [amdexperimental]
 agent_pool: mi325_2
 # grade: Blocking
@@ -1404,7 +1326,21 @@ steps:
 - vllm/
 - tests/weight_loading
 commands:
-- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
+- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

+- label: Weight Loading Multiple GPU Test - Large Models # optional
+mirror_hardwares: [amdexperimental]
+agent_pool: mi325_2
+# grade: Blocking
+working_dir: "/vllm-workspace/tests"
+num_gpus: 2
+gpu: a100
+optional: true
+source_file_dependencies:
+- vllm/
+- tests/weight_loading
+commands:
+- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
 mirror_hardwares: [amdexperimental]
@@ -1440,13 +1376,12 @@ steps:
 - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
 - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
-gpu: a100
-optional: true
-mirror_hardwares: [amdexperimental]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_4
 # grade: Blocking
+gpu: a100
+optional: true
 num_gpus: 4
 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
 source_file_dependencies:
@@ -1458,11 +1393,11 @@ steps:

 ##### H100 test #####
 - label: LM Eval Large Models (H100) # optional
-gpu: h100
-optional: true
-mirror_hardwares: [amdexperimental]
+mirror_hardwares: [amdexperimental, amdproduction]
 agent_pool: mi325_4
 # grade: Blocking
+gpu: h100
+optional: true
 num_gpus: 4
 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
 source_file_dependencies:
@@ -1472,7 +1407,6 @@ steps:
 - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
 mirror_hardwares: [amdexperimental]
@@ -1483,14 +1417,12 @@ steps:
 working_dir: "/vllm-workspace/"
 num_gpus: 2
 commands:
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+- pytest -v -s tests/compile/test_async_tp.py
+- pytest -v -s tests/compile/test_sequence_parallelism.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
+- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
 - pytest -v -s tests/distributed/test_context_parallel.py
-- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
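In the H200 hunk above, the data-parallel example is pinned to two specific GPUs through the device-visibility variable (HIP_VISIBLE_DEVICES on the ROCm side, CUDA_VISIBLE_DEVICES on the NVIDIA side) before enabling the DeepEP all-to-all backend. A minimal sketch of that launch pattern outside the pipeline; the GPU indices are placeholders:

    # pin the run to two GPUs and enable the DeepEP high-throughput all-to-all backend
    export VLLM_ALL2ALL_BACKEND=deepep_high_throughput
    export VLLM_USE_DEEP_GEMM=1
    CUDA_VISIBLE_DEVICES=0,1 python3 examples/offline_inference/data_parallel.py \
        --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048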
@@ -1504,57 +1436,6 @@ steps:
 - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
 - pytest -v -s tests/v1/distributed/test_dbo.py

-##### E2E Eval Tests #####
-- label: LM Eval Small Models (1 Card) # 15min
-timeout_in_minutes: 20
-mirror_hardwares: [amdexperimental, amdproduction]
-agent_pool: mi325_1
-# grade: Blocking
-source_file_dependencies:
-- csrc/
-- vllm/model_executor/layers/quantization
-commands:
-- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
-- label: LM Eval Large Models (4 Card)
-mirror_hardwares: [amdexperimental, amdproduction]
-agent_pool: mi325_4
-# grade: Blocking
-gpu: a100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-source_file_dependencies:
-- csrc/
-- vllm/model_executor/layers/quantization
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
-- label: ROCm LM Eval Large Models (8 Card)
-mirror_hardwares: [amdproduction]
-agent_pool: mi325_8
-num_gpus: 8
-working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
-
-- label: ROCm GPT-OSS Eval
-timeout_in_minutes: 60
-working_dir: "/vllm-workspace/"
-agent_pool: mi325_1
-mirror_hardwares: [amdexperimental, amdproduction]
-optional: true # run on nightlies
-source_file_dependencies:
-- tests/evals/gpt_oss
-- vllm/model_executor/models/gpt_oss.py
-- vllm/model_executor/layers/quantization/mxfp4.py
-- vllm/v1/attention/backends/flashinfer.py
-commands:
-- uv pip install --system 'gpt-oss[eval]==0.0.5'
-- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
 mirror_hardwares: [amdexperimental]
@@ -1569,8 +1450,9 @@ steps:
 - .buildkite/scripts/run-prime-rl-test.sh
 commands:
 - bash .buildkite/scripts/run-prime-rl-test.sh

 - label: DeepSeek V2-Lite Accuracy
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_4
 # grade: Blocking
 timeout_in_minutes: 60
@@ -1581,8 +1463,8 @@ steps:
 commands:
 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

-- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
-mirror_hardwares: [amdexperimental, amdproduction]
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+mirror_hardwares: [amdexperimental]
 agent_pool: mi325_4
 # grade: Blocking
 timeout_in_minutes: 60
@@ -1591,36 +1473,4 @@ steps:
 num_gpus: 4
 working_dir: "/vllm-workspace"
 commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-timeout_in_minutes: 60
-gpu: b200
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_4
-# grade: Blocking
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-timeout_in_minutes: 60
-mirror_hardwares: [amdexperimental]
-agent_pool: mi325_4
-# grade: Blocking
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
@@ -57,16 +57,14 @@ steps:
 - pytest -v -s -m 'not cpu_test' multimodal
 - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
-timeout_in_minutes: 30
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
+timeout_in_minutes: 10
 source_file_dependencies:
 - vllm/
 - tests/test_inputs.py
 - tests/test_outputs.py
 - tests/multimodal
 - tests/standalone_tests/lazy_imports.py
-- tests/tokenizers_
-- tests/tool_parsers
 - tests/transformers_utils
 - tests/config
 no_gpu: true
@@ -75,8 +73,6 @@ steps:
 - pytest -v -s test_inputs.py
 - pytest -v -s test_outputs.py
 - pytest -v -s -m 'cpu_test' multimodal
-- pytest -v -s tokenizers_
-- pytest -v -s tool_parsers
 - pytest -v -s transformers_utils
 - pytest -v -s config

@@ -171,7 +167,7 @@ steps:
 - tests/distributed/test_utils
 - tests/distributed/test_pynccl
 - tests/distributed/test_events
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness
 - examples/offline_inference/rlhf.py
 - examples/offline_inference/rlhf_colocate.py
 - tests/examples/offline_inference/data_parallel.py
@@ -196,13 +192,12 @@ steps:
 # test with internal dp
 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
 - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 - pytest -v -s distributed/test_utils.py
-- pytest -v -s compile/fullgraph/test_basic_correctness.py
+- pytest -v -s compile/test_basic_correctness.py
 - pytest -v -s distributed/test_pynccl.py
 - pytest -v -s distributed/test_events.py
 - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -280,18 +275,21 @@ steps:
 - pytest -v -s test_regression.py
 working_dir: "/vllm-workspace/tests" # optional

-- label: Engine Test # 9min
-timeout_in_minutes: 15
+- label: Engine Test # 25min
+timeout_in_minutes: 40
 mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/engine
+- tests/tokenization
 - tests/test_sequence
 - tests/test_config
 - tests/test_logger
 - tests/test_vllm_port
 commands:
 - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+# OOM in the CI unless we run this separately
+- pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
 timeout_in_minutes: 45
@@ -348,19 +346,6 @@ steps:
 commands:
 - pytest -v -s v1/attention

-- label: Batch Invariance Tests (H100) # 10min
-timeout_in_minutes: 25
-gpu: h100
-source_file_dependencies:
-- vllm/v1/attention
-- vllm/model_executor/layers
-- tests/v1/determinism/
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pip install pytest-timeout pytest-forked
-- pytest -v -s v1/determinism/test_batch_invariance.py
-- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
-
 - label: V1 Test attention (B200) # 10min
 timeout_in_minutes: 30
 gpu: b200
@@ -390,28 +375,23 @@ steps:
 working_dir: "/vllm-workspace/examples"
 source_file_dependencies:
 - vllm/entrypoints
-- vllm/multimodal
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-# for basic
-- python3 offline_inference/basic/chat.py
 - python3 offline_inference/basic/generate.py --model facebook/opt-125m
 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+- python3 offline_inference/basic/chat.py
+- python3 offline_inference/prefix_caching.py
+- python3 offline_inference/llm_engine_example.py
+- python3 offline_inference/audio_language.py --seed 0
+- python3 offline_inference/vision_language.py --seed 0
+- python3 offline_inference/vision_language_pooling.py --seed 0
+- python3 offline_inference/vision_language_multi_image.py --seed 0
+- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
 - python3 offline_inference/basic/classify.py
 - python3 offline_inference/basic/embed.py
 - python3 offline_inference/basic/score.py
-# for multi-modal models
-- python3 offline_inference/audio_language.py --seed 0
-- python3 offline_inference/vision_language.py --seed 0
-- python3 offline_inference/vision_language_multi_image.py --seed 0
-- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-# for pooling models
-- python3 pooling/pooling/vision_language_pooling.py --seed 0
-# for features demo
-- python3 offline_inference/prefix_caching.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -465,14 +445,18 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run unit tests defined directly under compile/,
-# not including subdirectories, which are usually heavier
-# tests covered elsewhere.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-# However, find does not normally propagate error codes, so we combine it with xargs
-# (using -0 for proper path handling)
-- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+- pytest -v -s compile/test_graph_partition.py
+- pytest -v -s compile/test_config.py
+- pytest -v -s compile/test_pass_manager.py
+- pytest -v -s compile/test_fusion.py
+- pytest -v -s compile/test_fusion_attn.py
+- pytest -v -s compile/test_functionalization.py
+- pytest -v -s compile/test_silu_mul_quant_fusion.py
+- pytest -v -s compile/test_fusion_all_reduce.py
+- pytest -v -s compile/test_decorator.py
+- pytest -v -s compile/test_noop_elimination.py
+- pytest -v -s compile/test_aot_compile.py
+- pytest -v -s compile/test_qk_norm_rope_fusion.py

 - label: PyTorch Fullgraph Smoke Test # 15min
 timeout_in_minutes: 30
@@ -482,13 +466,9 @@ steps:
 - vllm/
 - tests/compile
 commands:
-# Run smoke tests under fullgraph directory, except test_full_graph.py
-# as it is a heavy test that is covered in other steps.
-# Use `find` to launch multiple instances of pytest so that
-# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-# However, find does not normally propagate error codes, so we combine it with xargs
-# (using -0 for proper path handling)
-- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+- pytest -v -s compile/test_basic_correctness.py
+- pytest -v -s compile/test_multimodal_compile.py
+- pytest -v -s compile/piecewise/

 - label: PyTorch Fullgraph Test # 27min
 timeout_in_minutes: 40
@@ -499,10 +479,10 @@ steps:
 - tests/compile
 commands:
 # fp8 kv scales not supported on sm89, tested on Blackwell instead
-- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
 # Limit to no custom ops to reduce running time
 # Wrap with quotes to escape yaml and avoid starting -k string with a -
-- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
 timeout_in_minutes: 20
@@ -574,25 +554,6 @@ steps:
 commands:
 - pytest -v -s kernels/mamba

-- label: Kernels DeepGEMM Test (H100)
-timeout_in_minutes: 45
-gpu: h100
-num_gpus: 1
-source_file_dependencies:
-- tools/install_deepgemm.sh
-- vllm/utils/deep_gemm.py
-- vllm/model_executor/layers/fused_moe
-- vllm/model_executor/layers/quantization
-- tests/kernels/quantization/test_block_fp8.py
-- tests/kernels/moe/test_deepgemm.py
-- tests/kernels/moe/test_batched_deepgemm.py
-- tests/kernels/attention/test_deepgemm_attention.py
-commands:
-- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
-- pytest -v -s kernels/moe/test_deepgemm.py
-- pytest -v -s kernels/moe/test_batched_deepgemm.py
-- pytest -v -s kernels/attention/test_deepgemm_attention.py
-
 - label: Model Executor Test # 23min
 timeout_in_minutes: 35
 torch_nightly: true
@@ -643,7 +604,6 @@ steps:
 # we can only upgrade after this is resolved
 # TODO(jerryzh168): resolve the above comment
 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
-- uv pip install --system conch-triton-kernels
 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
@@ -674,7 +634,16 @@ steps:
 - vllm/
 - tests/tool_use
 commands:
-- pytest -v -s tool_use
+- pytest -v -s -m 'not cpu_test' tool_use

+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+timeout_in_minutes: 10
+source_file_dependencies:
+- vllm/
+- tests/tool_use
+no_gpu: true
+commands:
+- pytest -v -s -m 'cpu_test' tool_use
+
 ##### models test #####

@@ -685,7 +654,6 @@ steps:
 source_file_dependencies:
 - vllm/
 - tests/models/test_initialization.py
-- tests/models/registry.py
 commands:
 # Run a subset of model initialization tests
 - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -696,9 +664,7 @@ steps:
 torch_nightly: true
 source_file_dependencies:
 - vllm/model_executor/models/
-- vllm/transformers_utils/
 - tests/models/test_initialization.py
-- tests/models/registry.py
 commands:
 # Only when vLLM model source is modified - test initialization of a large
 # subset of supported models (the complement of the small subset in the above
@@ -824,24 +790,14 @@ steps:
 commands:
 - pytest -v -s models/language/pooling_mteb_test

-- label: Multi-Modal Processor Test (CPU)
-timeout_in_minutes: 60
-source_file_dependencies:
-- vllm/
-- tests/models/multimodal
-no_gpu: true
-commands:
-- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
-- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-
-- label: Multi-Modal Processor Test
+- label: Multi-Modal Processor Test # 44min
 timeout_in_minutes: 60
 source_file_dependencies:
 - vllm/
 - tests/models/multimodal
 commands:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/processing/test_tensor_schema.py
+- pytest -v -s models/multimodal/processing

 - label: Multi-Modal Models Test (Standard) # 60min
 timeout_in_minutes: 80
@@ -918,15 +874,14 @@ steps:
 - label: Transformers Nightly Models Test
 working_dir: "/vllm-workspace/"
 optional: true
-soft_fail: true
 commands:
 - pip install --upgrade git+https://github.com/huggingface/transformers
-- pytest -v -s tests/models/test_initialization.py
+- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
 - pytest -v -s tests/models/test_transformers.py
-- pytest -v -s tests/models/multimodal/processing/
-- pytest -v -s tests/models/multimodal/test_mapping.py
+# - pytest -v -s tests/models/multimodal/processing/
+- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
 - python3 examples/offline_inference/basic/chat.py
-- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
 # Whisper needs spawn method to avoid deadlock
 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -970,7 +925,6 @@ steps:
 - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
 - pytest -v -s tests/kernels/moe/test_flashinfer.py
-- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

 - label: Blackwell Fusion and Compile Tests # 30 min
 timeout_in_minutes: 40
@@ -980,29 +934,22 @@ steps:
 - csrc/quantization/fp4/
 - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
 - vllm/v1/attention/backends/flashinfer.py
-- vllm/v1/worker/
-- vllm/v1/cudagraph_dispatcher.py
 - vllm/compilation/
 # can affect pattern matching
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/test_fusion_attn.py
-- tests/compile/test_silu_mul_quant_fusion.py
-- tests/compile/distributed/test_fusion_all_reduce.py
-- tests/compile/distributed/test_fusions_e2e.py
-- tests/compile/fullgraph/test_full_graph.py
 commands:
 - nvidia-smi
 - pytest -v -s tests/compile/test_fusion_attn.py
 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 # this runner has 2 GPUs available even though num_gpus=2 is not set
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
 # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
 # Wrap with quotes to escape yaml
-- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
 # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
 timeout_in_minutes: 40
@@ -1019,11 +966,12 @@ steps:
 - vllm/model_executor/layers/layernorm.py
 - vllm/model_executor/layers/activation.py
 - vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/distributed/test_fusions_e2e.py
+- tests/compile/test_fusions_e2e.py
+- tests/compile/test_full_graph.py
 commands:
 - nvidia-smi
 # Run all e2e fusion tests
-- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+- pytest -v -s tests/compile/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
 timeout_in_minutes: 60
@@ -1121,7 +1069,7 @@ steps:
 - vllm/worker/worker_base.py
 - vllm/v1/engine/
 - vllm/v1/worker/
-- tests/compile/fullgraph/test_basic_correctness.py
+- tests/compile/test_basic_correctness.py
 - tests/compile/test_wrapper.py
 - tests/distributed/
 - tests/entrypoints/llm/test_collective_rpc.py
@@ -1133,11 +1081,10 @@ steps:
 # https://github.com/NVIDIA/nccl/issues/1838
 - export NCCL_CUMEM_HOST_ENABLE=0
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
 - pytest -v -s entrypoints/llm/test_collective_rpc.py
-- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+- pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1223,8 +1170,6 @@ steps:
 # FIXIT: find out which code initialize cuda before running the test
 # before the fix, we need to use spawn to test it
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-# Alot of these tests are on the edge of OOMing
-- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 # There is some Tensor Parallelism related processing logic in LoRA that
 # requires multi-GPU testing for validation.
 - pytest -v -s -x lora/test_chatglm3_tp.py
@@ -1319,11 +1264,11 @@ steps:
 working_dir: "/vllm-workspace/"
 num_gpus: 2
 commands:
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+- pytest -v -s tests/compile/test_async_tp.py
+- pytest -v -s tests/compile/test_sequence_parallelism.py
+- pytest -v -s tests/compile/test_fusion_all_reduce.py
+- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+- pytest -v -s tests/distributed/test_sequence_parallel.py
 - pytest -v -s tests/distributed/test_context_parallel.py
 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1343,7 +1288,6 @@ steps:
 - label: Prime-RL Integration Test # 15min
 timeout_in_minutes: 30
 optional: true
-soft_fail: true
 num_gpus: 2
 working_dir: "/vllm-workspace"
 source_file_dependencies:
@@ -1361,20 +1305,11 @@ steps:
 commands:
 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

-- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+- label: Qwen3-30B-A3B-FP8-block Accuracy
 timeout_in_minutes: 60
 gpu: h100
 optional: true
 num_gpus: 4
 working_dir: "/vllm-workspace"
 commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-timeout_in_minutes: 60
-gpu: b200
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -1,21 +0,0 @@
-group: Attention
-depends_on:
-- image-build
-steps:
-- label: V1 attention (H100)
-timeout_in_minutes: 30
-gpu: h100
-source_file_dependencies:
-- vllm/v1/attention
-- tests/v1/attention
-commands:
-- pytest -v -s v1/attention
-
-- label: V1 attention (B200)
-timeout_in_minutes: 30
-gpu: b200
-source_file_dependencies:
-- vllm/v1/attention
-- tests/v1/attention
-commands:
-- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
@@ -1,16 +0,0 @@
-group: Basic Correctness
-depends_on:
-- image-build
-steps:
-- label: Basic Correctness
-timeout_in_minutes: 30
-source_file_dependencies:
-- vllm/
-- tests/basic_correctness/test_basic_correctness
-- tests/basic_correctness/test_cpu_offload
-- tests/basic_correctness/test_cumem.py
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s basic_correctness/test_cumem.py
-- pytest -v -s basic_correctness/test_basic_correctness.py
-- pytest -v -s basic_correctness/test_cpu_offload.py
@@ -1,19 +0,0 @@
-group: Benchmarks
-depends_on:
-- image-build
-steps:
-- label: Benchmarks
-timeout_in_minutes: 20
-working_dir: "/vllm-workspace/.buildkite"
-source_file_dependencies:
-- benchmarks/
-commands:
-- bash scripts/run-benchmarks.sh
-
-- label: Benchmarks CLI Test
-timeout_in_minutes: 20
-source_file_dependencies:
-- vllm/
-- tests/benchmarks/
-commands:
-- pytest -v -s benchmarks/
@@ -1,57 +0,0 @@
-group: Compile
-depends_on:
-- image-build
-steps:
-- label: Fusion and Compile Tests (B200)
-timeout_in_minutes: 40
-working_dir: "/vllm-workspace/"
-gpu: b200
-source_file_dependencies:
-- csrc/quantization/fp4/
-- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-- vllm/v1/attention/backends/flashinfer.py
-- vllm/v1/worker/
-- vllm/v1/cudagraph_dispatcher.py
-- vllm/compilation/
-# can affect pattern matching
-- vllm/model_executor/layers/layernorm.py
-- vllm/model_executor/layers/activation.py
-- vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/test_fusion_attn.py
-- tests/compile/test_silu_mul_quant_fusion.py
-- tests/compile/distributed/test_fusion_all_reduce.py
-- tests/compile/distributed/test_fusions_e2e.py
-- tests/compile/fullgraph/test_full_graph.py
-commands:
-- nvidia-smi
-- pytest -v -s tests/compile/test_fusion_attn.py
-- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-# this runner has 2 GPUs available even though num_gpus=2 is not set
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-# Wrap with quotes to escape yaml
-- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
-
-- label: Fusion E2E (2 GPUs)(B200)
-timeout_in_minutes: 40
-working_dir: "/vllm-workspace/"
-gpu: b200
-optional: true
-num_gpus: 2
-source_file_dependencies:
-- csrc/quantization/fp4/
-- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-- vllm/v1/attention/backends/flashinfer.py
-- vllm/compilation/
-# can affect pattern matching
-- vllm/model_executor/layers/layernorm.py
-- vllm/model_executor/layers/activation.py
-- vllm/model_executor/layers/quantization/input_quant_fp8.py
-- tests/compile/distributed/test_fusions_e2e.py
-commands:
-- nvidia-smi
-# Run all e2e fusion tests
-- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
@@ -1,22 +0,0 @@
-group: CUDA
-depends_on:
-- image-build
-steps:
-- label: Platform Tests (CUDA)
-timeout_in_minutes: 15
-source_file_dependencies:
-- vllm/
-- tests/cuda
-commands:
-- pytest -v -s cuda/test_cuda_context.py
-
-- label: Cudagraph
-timeout_in_minutes: 20
-source_file_dependencies:
-- tests/v1/cudagraph
-- vllm/v1/cudagraph_dispatcher.py
-- vllm/config/compilation.py
-- vllm/compilation
-commands:
-- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
-- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
@@ -1,199 +0,0 @@
-group: Distributed
-depends_on:
-- image-build
-steps:
-- label: Distributed Comm Ops
-timeout_in_minutes: 20
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-source_file_dependencies:
-- vllm/distributed
-- tests/distributed
-commands:
-- pytest -v -s distributed/test_comm_ops.py
-- pytest -v -s distributed/test_shm_broadcast.py
-- pytest -v -s distributed/test_shm_buffer.py
-- pytest -v -s distributed/test_shm_storage.py
-
-- label: Distributed (2 GPUs)
-timeout_in_minutes: 90
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-source_file_dependencies:
-- vllm/compilation/
-- vllm/distributed/
-- vllm/engine/
-- vllm/executor/
-- vllm/worker/worker_base.py
-- vllm/v1/engine/
-- vllm/v1/worker/
-- tests/compile/fullgraph/test_basic_correctness.py
-- tests/compile/test_wrapper.py
-- tests/distributed/
-- tests/entrypoints/llm/test_collective_rpc.py
-- tests/v1/distributed
-- tests/v1/entrypoints/openai/test_multi_api_servers.py
-- tests/v1/shutdown
-- tests/v1/worker/test_worker_memory_snapshot.py
-commands:
-# https://github.com/NVIDIA/nccl/issues/1838
-- export NCCL_CUMEM_HOST_ENABLE=0
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
-- pytest -v -s entrypoints/llm/test_collective_rpc.py
-- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
-- pytest -v -s ./compile/test_wrapper.py
-- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-- pytest -v -s distributed/test_sequence_parallel.py
-- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-- pytest -v -s v1/worker/test_worker_memory_snapshot.py
-
-- label: Distributed Tests (4 GPUs)
-timeout_in_minutes: 50
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/
-- tests/distributed/test_utils
-- tests/distributed/test_pynccl
-- tests/distributed/test_events
-- tests/compile/fullgraph/test_basic_correctness.py
-- examples/offline_inference/rlhf.py
-- examples/offline_inference/rlhf_colocate.py
-- tests/examples/offline_inference/data_parallel.py
-- tests/v1/distributed
-- tests/v1/engine/test_engine_core_client.py
-- tests/distributed/test_symm_mem_allreduce.py
-commands:
-# https://github.com/NVIDIA/nccl/issues/1838
-- export NCCL_CUMEM_HOST_ENABLE=0
-# test with torchrun tp=2 and external_dp=2
-- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-# test with torchrun tp=2 and pp=2
-- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-# test with torchrun tp=4 and dp=1
-- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with torchrun tp=2, pp=2 and dp=1
-- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with torchrun tp=1 and dp=4 with ep
-- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with torchrun tp=2 and dp=2 with ep
-- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
-# test with internal dp
-- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
-- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
-- pytest -v -s distributed/test_utils.py
-- pytest -v -s compile/fullgraph/test_basic_correctness.py
-- pytest -v -s distributed/test_pynccl.py
-- pytest -v -s distributed/test_events.py
-- pytest -v -s distributed/test_symm_mem_allreduce.py
-# TODO: create a dedicated test section for multi-GPU example tests
-# when we have multiple distributed example tests
-- cd ../examples/offline_inference
-- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-
-- label: Distributed Tests (8 GPUs)(H100)
-timeout_in_minutes: 10
-gpu: h100
-num_gpus: 8
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- examples/offline_inference/torchrun_dp_example.py
-- vllm/config/parallel.py
-- vllm/distributed/
-- vllm/v1/engine/llm_engine.py
-- vllm/v1/executor/uniproc_executor.py
-- vllm/v1/worker/gpu_worker.py
-commands:
-# https://github.com/NVIDIA/nccl/issues/1838
-- export NCCL_CUMEM_HOST_ENABLE=0
-# test with torchrun tp=2 and dp=4 with ep
-- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
-
-- label: Distributed Tests (4 GPUs)(A100)
-gpu: a100
-optional: true
-num_gpus: 4
-source_file_dependencies:
-- vllm/
-commands:
-# NOTE: don't test llama model here, it seems hf implementation is buggy
-# see https://github.com/vllm-project/vllm/pull/5689 for details
-- pytest -v -s distributed/test_custom_all_reduce.py
-- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-- pytest -v -s -x lora/test_mixtral.py
-
-- label: Distributed Tests (2 GPUs)(H200)
-gpu: h200
-optional: true
-working_dir: "/vllm-workspace/"
-num_gpus: 2
-commands:
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
-- pytest -v -s tests/distributed/test_context_parallel.py
-- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
-- pytest -v -s tests/v1/distributed/test_dbo.py
-
-- label: Distributed Tests (2 GPUs)(B200)
-gpu: b200
-optional: true
-working_dir: "/vllm-workspace/"
-num_gpus: 2
-commands:
-- pytest -v -s tests/distributed/test_context_parallel.py
-- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-- pytest -v -s tests/v1/distributed/test_dbo.py
-
-- label: 2 Node Test (4 GPUs)
-timeout_in_minutes: 30
-working_dir: "/vllm-workspace/tests"
-num_gpus: 2
-num_nodes: 2
-source_file_dependencies:
-- vllm/distributed/
-- vllm/engine/
-- vllm/executor/
-- vllm/model_executor/models/
-- tests/distributed/
-- tests/examples/offline_inference/data_parallel.py
-commands:
-- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
-
-- label: Distributed NixlConnector PD accuracy (4 GPUs)
-timeout_in_minutes: 30
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-- tests/v1/kv_connector/nixl_integration/
-commands:
-- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
-- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
-
-- label: Pipeline + Context Parallelism (4 GPUs))
-timeout_in_minutes: 60
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/
-- vllm/engine/
-- vllm/executor/
-- vllm/model_executor/models/
-- tests/distributed/
-commands:
-- pytest -v -s distributed/test_pp_cudagraph.py
-- pytest -v -s distributed/test_pipeline_parallel.py
@@ -1,59 +0,0 @@
-group: E2E Integration
-depends_on:
-- image-build
-steps:
-- label: DeepSeek V2-Lite Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
-
-- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
-timeout_in_minutes: 60
-gpu: b200
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
-- label: Prime-RL Integration (2 GPUs)
-timeout_in_minutes: 30
-optional: true
-num_gpus: 2
-working_dir: "/vllm-workspace"
-source_file_dependencies:
-- vllm/
-- .buildkite/scripts/run-prime-rl-test.sh
-commands:
-- bash .buildkite/scripts/run-prime-rl-test.sh
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-timeout_in_minutes: 60
-gpu: h100
-optional: true
-num_gpus: 4
-working_dir: "/vllm-workspace"
-commands:
-- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
@@ -1,26 +0,0 @@
-group: Engine
-depends_on:
-- image-build
-steps:
-- label: Engine
-timeout_in_minutes: 15
-source_file_dependencies:
-- vllm/
-- tests/engine
-- tests/test_sequence
-- tests/test_config
-- tests/test_logger
-- tests/test_vllm_port
-commands:
-- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-
-- label: V1 e2e + engine
-timeout_in_minutes: 45
-source_file_dependencies:
-- vllm/
-- tests/v1
-commands:
-# TODO: accuracy does not match, whether setting
-# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-- pytest -v -s v1/e2e
-- pytest -v -s v1/engine
@@ -1,68 +0,0 @@
-group: Entrypoints
-depends_on:
-- image-build
-steps:
-- label: Entrypoints Unit Tests
-timeout_in_minutes: 10
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/entrypoints
-- tests/entrypoints/
-commands:
-- pytest -v -s entrypoints/openai/tool_parsers
-- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-
-- label: Entrypoints Integration (LLM)
-timeout_in_minutes: 40
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/
-- tests/entrypoints/llm
-- tests/entrypoints/offline_mode
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-
-- label: Entrypoints Integration (API Server)
-timeout_in_minutes: 130
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/
-- tests/entrypoints/openai
-- tests/entrypoints/test_chat_utils
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-- pytest -v -s entrypoints/test_chat_utils.py
-
-
-- label: Entrypoints Integration (Pooling)
-timeout_in_minutes: 50
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/
-- tests/entrypoints/pooling
-commands:
-- export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s entrypoints/pooling
-
-
-- label: Entrypoints V1
-timeout_in_minutes: 50
-source_file_dependencies:
-- vllm/
-- tests/v1
-commands:
-- pytest -v -s v1/entrypoints
-
-- label: OpenAI API Correctness
-timeout_in_minutes: 30
-source_file_dependencies:
-- csrc/
-- vllm/entrypoints/openai/
-- vllm/model_executor/models/whisper.py
-commands: # LMEval+Transcription WER check
-- pytest -s entrypoints/openai/correctness/
@@ -1,23 +0,0 @@
-group: Expert Parallelism
-depends_on:
-- image-build
-steps:
-- label: EPLB Algorithm
-timeout_in_minutes: 15
-working_dir: "/vllm-workspace/tests"
-source_file_dependencies:
-- vllm/distributed/eplb
-- tests/distributed/test_eplb_algo.py
-commands:
-- pytest -v -s distributed/test_eplb_algo.py
-
-- label: EPLB Execution
-timeout_in_minutes: 20
-working_dir: "/vllm-workspace/tests"
-num_gpus: 4
-source_file_dependencies:
-- vllm/distributed/eplb
-- tests/distributed/test_eplb_execute.py
-commands:
-- pytest -v -s distributed/test_eplb_execute.py
-- pytest -v -s distributed/test_eplb_spec_decode.py
@@ -1,117 +0,0 @@
group: Kernels
depends_on:
  - image-build
steps:
  - label: Kernels Core Operation Test
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - tests/kernels/core
      - tests/kernels/test_top_k_per_row.py
    commands:
      - pytest -v -s kernels/core kernels/test_top_k_per_row.py

  - label: Kernels Attention Test %N
    timeout_in_minutes: 35
    source_file_dependencies:
      - csrc/attention/
      - vllm/attention
      - vllm/v1/attention
      - tests/kernels/attention
    commands:
      - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels Quantization Test %N
    timeout_in_minutes: 90
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/layers/quantization
      - tests/kernels/quantization
    commands:
      - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels MoE Test %N
    timeout_in_minutes: 60
    source_file_dependencies:
      - csrc/quantization/cutlass_w8a8/moe/
      - csrc/moe/
      - tests/kernels/moe
      - vllm/model_executor/layers/fused_moe/
      - vllm/distributed/device_communicators/
      - vllm/envs.py
      - vllm/config
    commands:
      - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels Mamba Test
    timeout_in_minutes: 45
    source_file_dependencies:
      - csrc/mamba/
      - tests/kernels/mamba
      - vllm/model_executor/layers/mamba/ops
    commands:
      - pytest -v -s kernels/mamba

  - label: Kernels DeepGEMM Test (H100)
    timeout_in_minutes: 45
    gpu: h100
    num_gpus: 1
    source_file_dependencies:
      - tools/install_deepgemm.sh
      - vllm/utils/deep_gemm.py
      - vllm/model_executor/layers/fused_moe
      - vllm/model_executor/layers/quantization
      - tests/kernels/quantization/test_block_fp8.py
      - tests/kernels/moe/test_deepgemm.py
      - tests/kernels/moe/test_batched_deepgemm.py
      - tests/kernels/attention/test_deepgemm_attention.py
    commands:
      - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
      - pytest -v -s kernels/moe/test_deepgemm.py
      - pytest -v -s kernels/moe/test_batched_deepgemm.py
      - pytest -v -s kernels/attention/test_deepgemm_attention.py

  - label: Kernels (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    gpu: b200
    # optional: true
    source_file_dependencies:
      - csrc/quantization/fp4/
      - csrc/attention/mla/
      - csrc/quantization/cutlass_w8a8/moe/
      - vllm/model_executor/layers/fused_moe/cutlass_moe.py
      - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
      - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
      - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/v1/attention/backends/mla/cutlass_mla.py
      - vllm/v1/attention/backends/mla/flashinfer_mla.py
      - vllm/platforms/cuda.py
      - vllm/attention/selector.py
    commands:
      - nvidia-smi
      - python3 examples/offline_inference/basic/chat.py
      # Attention
      # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
      - pytest -v -s tests/kernels/attention/test_attention_selector.py
      - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
      - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
      - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
      - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
      # Quantization
      - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
      - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
      - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
      - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
      - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
      - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
      - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
      - pytest -v -s tests/kernels/moe/test_flashinfer.py
      - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
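Several of the kernel steps above fan out with parallelism: 2 and pass --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT; the doubled $$ defers interpolation so the Buildkite agent substitutes the values at run time. A rough Python sketch of the kind of round-robin split this implies, assuming index-modulo assignment (the actual pytest sharding plugin may partition collected tests differently):

# Sketch: split collected test ids across Buildkite parallel jobs.
# Assumption: round-robin by collection index, one common --shard-id/--num-shards strategy.
def tests_for_shard(test_ids: list[str], shard_id: int, num_shards: int) -> list[str]:
    return [t for i, t in enumerate(test_ids) if i % num_shards == shard_id]

tests = ["test_attn[a]", "test_attn[b]", "test_attn[c]", "test_attn[d]"]
print(tests_for_shard(tests, shard_id=0, num_shards=2))  # ['test_attn[a]', 'test_attn[c]']
print(tests_for_shard(tests, shard_id=1, num_shards=2))  # ['test_attn[b]', 'test_attn[d]']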
@@ -1,46 +0,0 @@
group: LM Eval
depends_on:
  - image-build
steps:
  - label: LM Eval Small Models
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    autorun_on_main: true
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

  - label: LM Eval Large Models (4 GPUs)(A100)
    gpu: a100
    optional: true
    num_gpus: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

  - label: LM Eval Large Models (4 GPUs)(H100)
    gpu: h100
    optional: true
    num_gpus: 4
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

  - label: LM Eval Small Models (B200)
    timeout_in_minutes: 120
    gpu: b200
    optional: true
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
    commands:
      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
@@ -1,31 +0,0 @@
group: LoRA
depends_on:
  - image-build
steps:
  - label: LoRA %N
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/lora
      - tests/lora
    commands:
      - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
    parallelism: 4

  - label: LoRA TP (Distributed)
    timeout_in_minutes: 30
    num_gpus: 4
    source_file_dependencies:
      - vllm/lora
      - tests/lora
    commands:
      # FIXIT: find out which code initialize cuda before running the test
      # before the fix, we need to use spawn to test it
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      # There is some Tensor Parallelism related processing logic in LoRA that
      # requires multi-GPU testing for validation.
      - pytest -v -s -x lora/test_chatglm3_tp.py
      - pytest -v -s -x lora/test_llama_tp.py
      - pytest -v -s -x lora/test_llm_with_multi_loras.py
      - pytest -v -s -x lora/test_olmoe_tp.py
      - pytest -v -s -x lora/test_gptoss_tp.py
@@ -1,165 +0,0 @@
group: Miscellaneous
depends_on:
  - image-build
steps:
  - label: V1 Others
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/
      - tests/v1
    commands:
      - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
      # split the test to avoid interference
      - pytest -v -s -m 'not cpu_test' v1/core
      - pytest -v -s v1/executor
      - pytest -v -s v1/kv_offload
      - pytest -v -s v1/sample
      - pytest -v -s v1/logits_processors
      - pytest -v -s v1/worker
      - pytest -v -s v1/spec_decode
      - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
      - pytest -v -s -m 'not cpu_test' v1/metrics
      - pytest -v -s v1/test_oracle.py
      - pytest -v -s v1/test_request.py
      - pytest -v -s v1/test_outputs.py
      # Integration test for streaming correctness (requires special branch).
      - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
      - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

  - label: V1 Others (CPU)
    depends_on: ~
    source_file_dependencies:
      - vllm/
      - tests/v1
    no_gpu: true
    commands:
      # split the test to avoid interference
      - pytest -v -s -m 'cpu_test' v1/core
      - pytest -v -s v1/structured_output
      - pytest -v -s v1/test_serial_utils.py
      - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
      - pytest -v -s -m 'cpu_test' v1/metrics

  - label: Regression
    timeout_in_minutes: 20
    source_file_dependencies:
      - vllm/
      - tests/test_regression
    commands:
      - pip install modelscope
      - pytest -v -s test_regression.py
    working_dir: "/vllm-workspace/tests" # optional

  - label: Examples
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/examples"
    source_file_dependencies:
      - vllm/entrypoints
      - vllm/multimodal
      - examples/
    commands:
      - pip install tensorizer # for tensorizer test
      - python3 offline_inference/basic/chat.py # for basic
      - python3 offline_inference/basic/generate.py --model facebook/opt-125m
      - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
      - python3 offline_inference/basic/classify.py
      - python3 offline_inference/basic/embed.py
      - python3 offline_inference/basic/score.py
      # for multi-modal models
      - python3 offline_inference/audio_language.py --seed 0
      - python3 offline_inference/vision_language.py --seed 0
      - python3 offline_inference/vision_language_multi_image.py --seed 0
      - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
      # for pooling models
      - python3 pooling/pooling/vision_language_pooling.py --seed 0
      # for features demo
      - python3 offline_inference/prefix_caching.py
      - python3 offline_inference/llm_engine_example.py
      - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
      - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
      # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
      - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

  - label: Metrics, Tracing (2 GPUs)
    timeout_in_minutes: 20
    num_gpus: 2
    source_file_dependencies:
      - vllm/
      - tests/v1/tracing
    commands:
      - "pip install \
        'opentelemetry-sdk>=1.26.0' \
        'opentelemetry-api>=1.26.0' \
        'opentelemetry-exporter-otlp>=1.26.0' \
        'opentelemetry-semantic-conventions-ai>=0.4.1'"
      - pytest -v -s v1/tracing

  - label: Python-only Installation
    depends_on: ~
    timeout_in_minutes: 20
    source_file_dependencies:
      - tests/standalone_tests/python_only_compile.sh
      - setup.py
    commands:
      - bash standalone_tests/python_only_compile.sh

  - label: Async Engine, Inputs, Utils, Worker
    timeout_in_minutes: 50
    source_file_dependencies:
      - vllm/
      - tests/multimodal
      - tests/utils_
    commands:
      - pytest -v -s -m 'not cpu_test' multimodal
      - pytest -v -s utils_

  - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
    depends_on: ~
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/test_inputs.py
      - tests/test_outputs.py
      - tests/multimodal
      - tests/standalone_tests/lazy_imports.py
      - tests/tokenizers_
      - tests/tool_parsers
      - tests/transformers_utils
      - tests/config
    no_gpu: true
    commands:
      - python3 standalone_tests/lazy_imports.py
      - pytest -v -s test_inputs.py
      - pytest -v -s test_outputs.py
      - pytest -v -s -m 'cpu_test' multimodal
      - pytest -v -s tokenizers_
      - pytest -v -s tool_parsers
      - pytest -v -s transformers_utils
      - pytest -v -s config

  - label: GPT-OSS Eval (B200)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/"
    gpu: b200
    optional: true
    source_file_dependencies:
      - tests/evals/gpt_oss
      - vllm/model_executor/models/gpt_oss.py
      - vllm/model_executor/layers/quantization/mxfp4.py
      - vllm/v1/attention/backends/flashinfer.py
    commands:
      - uv pip install --system 'gpt-oss[eval]==0.0.5'
      - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

  - label: Batch Invariance (H100)
    timeout_in_minutes: 25
    gpu: h100
    source_file_dependencies:
      - vllm/v1/attention
      - vllm/model_executor/layers
      - tests/v1/determinism/
    commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pip install pytest-timeout pytest-forked
      - pytest -v -s v1/determinism/test_batch_invariance.py
      - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
@@ -1,17 +0,0 @@
group: Model Executor
depends_on:
  - image-build
steps:
  - label: Model Executor
    timeout_in_minutes: 35
    source_file_dependencies:
      - vllm/engine/arg_utils.py
      - vllm/config/model.py
      - vllm/model_executor
      - tests/model_executor
      - tests/entrypoints/openai/test_tensorizer_entrypoint.py
    commands:
      - apt-get update && apt-get install -y curl libsodium23
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
      - pytest -v -s model_executor
      - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
@@ -1,62 +0,0 @@
group: Models - Basic
depends_on:
  - image-build
steps:
  - label: Basic Models Tests (Initialization)
    timeout_in_minutes: 45
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/
      - tests/models/test_initialization.py
    commands:
      # Run a subset of model initialization tests
      - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

  - label: Basic Models Tests (Extra Initialization) %N
    timeout_in_minutes: 45
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/model_executor/models/
      - tests/models/test_initialization.py
    commands:
      # Only when vLLM model source is modified - test initialization of a large
      # subset of supported models (the complement of the small subset in the above
      # test.) Also run if model initialization test file is modified
      - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
    parallelism: 2

  - label: Basic Models Tests (Other)
    timeout_in_minutes: 45
    source_file_dependencies:
      - vllm/
      - tests/models/test_transformers.py
      - tests/models/test_registry.py
    commands:
      - pytest -v -s models/test_transformers.py models/test_registry.py

  - label: Basic Models Test (Other CPU) # 5min
    timeout_in_minutes: 10
    source_file_dependencies:
      - vllm/
      - tests/models/test_utils.py
      - tests/models/test_vision.py
    no_gpu: true
    commands:
      - pytest -v -s models/test_utils.py models/test_vision.py

  - label: Transformers Nightly Models
    working_dir: "/vllm-workspace/"
    optional: true
    soft_fail: true
    commands:
      - pip install --upgrade git+https://github.com/huggingface/transformers
      - pytest -v -s tests/models/test_initialization.py
      - pytest -v -s tests/models/test_transformers.py
      - pytest -v -s tests/models/multimodal/processing/
      - pytest -v -s tests/models/multimodal/test_mapping.py
      - python3 examples/offline_inference/basic/chat.py
      - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
      # Whisper needs spawn method to avoid deadlock
      - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -1,22 +0,0 @@
group: Models - Distributed
depends_on:
  - image-build
steps:
  - label: Distributed Model Tests (2 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    source_file_dependencies:
      - vllm/model_executor/model_loader/sharded_state_loader.py
      - vllm/model_executor/models/
      - tests/basic_correctness/
      - tests/model_executor/model_loader/test_sharded_state_loader.py
      - tests/models/
    commands:
      - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
      - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
      # Avoid importing model tests that cause CUDA reinitialization error
      - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
      - pytest models/language -v -s -m 'distributed(num_gpus=2)'
      - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
      - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
@@ -1,91 +0,0 @@
group: Models - Language
depends_on:
  - image-build
steps:
  - label: Language Models Tests (Standard)
    timeout_in_minutes: 25
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/
      - tests/models/language
    commands:
      # Test standard language models, excluding a subset of slow tests
      - pip freeze | grep -E 'torch'
      - pytest -v -s models/language -m 'core_model and (not slow_test)'

  - label: Language Models Tests (Extra Standard) %N
    timeout_in_minutes: 45
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/model_executor/models/
      - tests/models/language/pooling/test_embedding.py
      - tests/models/language/generation/test_common.py
      - tests/models/language/pooling/test_classification.py
    commands:
      # Shard slow subset of standard language models tests. Only run when model
      # source is modified, or when specified test files are modified
      - pip freeze | grep -E 'torch'
      - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
    parallelism: 2

  - label: Language Models Tests (Hybrid) %N
    timeout_in_minutes: 75
    mirror_hardwares: [amdexperimental]
    torch_nightly: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/generation
    commands:
      # Install fast path packages for testing against transformers
      # Note: also needed to run plamo2 model in vLLM
      - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
      # Shard hybrid language model tests
      - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
    parallelism: 2

  - label: Language Models Test (Extended Generation) # 80min
    timeout_in_minutes: 110
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/generation
    commands:
      # Install fast path packages for testing against transformers
      # Note: also needed to run plamo2 model in vLLM
      - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
      - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

  - label: Language Models Test (PPL)
    timeout_in_minutes: 110
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/generation_ppl_test
    commands:
      - pytest -v -s models/language/generation_ppl_test

  - label: Language Models Test (Extended Pooling) # 36min
    timeout_in_minutes: 50
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/pooling
    commands:
      - pytest -v -s models/language/pooling -m 'not core_model'

  - label: Language Models Test (MTEB)
    timeout_in_minutes: 110
    mirror_hardwares: [amdexperimental]
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/language/pooling_mteb_test
    commands:
      - pytest -v -s models/language/pooling_mteb_test
@@ -1,79 +0,0 @@
group: Models - Multimodal
depends_on:
  - image-build
steps:
  - label: Multi-Modal Models (Standard) # 60min
    timeout_in_minutes: 80
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pip freeze | grep -E 'torch'
      - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
      - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

  - label: Multi-Modal Processor Test (CPU)
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    no_gpu: true
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

  - label: Multi-Modal Processor # 44min
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/processing/test_tensor_schema.py

  - label: Multi-Modal Accuracy Eval (Small Models) # 50min
    timeout_in_minutes: 70
    working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
    source_file_dependencies:
      - vllm/multimodal/
      - vllm/inputs/
      - vllm/v1/core/
    commands:
      - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

  - label: Multi-Modal Models (Extended) 1
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

  - label: Multi-Modal Models (Extended) 2
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

  - label: Multi-Modal Models (Extended) 3
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/models/multimodal
    commands:
      - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
      - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

  # This test is used only in PR development phase to test individual models and should never run on main
  - label: Custom Models
    optional: true
    commands:
      - echo 'Testing custom models...'
      # PR authors can temporarily add commands below to test individual models
      # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
      # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
@@ -1,34 +0,0 @@
group: Plugins
depends_on:
  - image-build
steps:
  - label: Plugin Tests (2 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    source_file_dependencies:
      - vllm/plugins/
      - tests/plugins/
    commands:
      # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
      - pip install -e ./plugins/vllm_add_dummy_platform
      - pytest -v -s plugins_tests/test_platform_plugins.py
      - pip uninstall vllm_add_dummy_platform -y
      # end platform plugin tests
      # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
      - pip install -e ./plugins/prithvi_io_processor_plugin
      - pytest -v -s plugins_tests/test_io_processor_plugins.py
      - pip uninstall prithvi_io_processor_plugin -y
      # end io_processor plugins test
      # begin stat_logger plugins test
      - pip install -e ./plugins/vllm_add_dummy_stat_logger
      - pytest -v -s plugins_tests/test_stats_logger_plugins.py
      - pip uninstall dummy_stat_logger -y
      # end stat_logger plugins test
      # other tests continue here:
      - pytest -v -s plugins_tests/test_scheduler_plugins.py
      - pip install -e ./plugins/vllm_add_dummy_model
      - pytest -v -s distributed/test_distributed_oot.py
      - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
      - pytest -v -s models/test_oot_registration.py # it needs a clean process
      - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
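Each plugin test above installs a package, exercises it, and uninstalls it so later commands see a clean environment. Out-of-tree plugins of this kind are normally discovered through Python entry points; the sketch below shows that generic discovery pattern on Python 3.10+, with "vllm.general_plugins" used only as an example group name rather than a confirmed detail of these test plugins:

# Sketch: generic entry-point discovery, the usual mechanism by which an
# installed out-of-tree plugin registers itself with a host package.
from importlib.metadata import entry_points

def load_plugins(group: str) -> dict[str, object]:
    # Each registered entry point maps a name to an importable object;
    # the host loads them all for the chosen group.
    return {ep.name: ep.load() for ep in entry_points(group=group)}

if __name__ == "__main__":
    # Prints {} unless a plugin registering under this (assumed) group is installed.
    print(load_plugins("vllm.general_plugins"))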
@@ -1,50 +0,0 @@
group: PyTorch
depends_on:
  - image-build
steps:
  - label: PyTorch Compilation Unit Tests
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/compile
    commands:
      # Run unit tests defined directly under compile/,
      # not including subdirectories, which are usually heavier
      # tests covered elsewhere.
      # Use `find` to launch multiple instances of pytest so that
      # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
      - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"

  - label: PyTorch Fullgraph Smoke Test
    timeout_in_minutes: 30
    source_file_dependencies:
      - vllm/
      - tests/compile
    commands:
      # Run smoke tests under fullgraph directory, except test_full_graph.py
      # as it is a heavy test that is covered in other steps.
      # Use `find` to launch multiple instances of pytest so that
      # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
      - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

  - label: PyTorch Fullgraph
    timeout_in_minutes: 40
    source_file_dependencies:
      - vllm/
      - tests/compile
    commands:
      # fp8 kv scales not supported on sm89, tested on Blackwell instead
      - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
      # Limit to no custom ops to reduce running time
      # Wrap with quotes to escape yaml and avoid starting -k string with a -
      - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

  - label: Pytorch Nightly Dependency Override Check # 2min
    # if this test fails, it means the nightly torch version is not compatible with some
    # of the dependencies. Please check the error message and add the package to whitelist
    # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
    soft_fail: true
    source_file_dependencies:
      - requirements/nightly_torch_test.txt
    commands:
      - bash standalone_tests/pytorch_nightly_dependency.sh
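The compilation steps above intentionally start one pytest process per test file (find ... -exec pytest {} \;) so that state from one test module cannot leak into the next. A hedged Python equivalent of that isolation pattern, illustrative rather than the CI's actual runner:

# Sketch: run each matching test file in its own pytest process for isolation.
import glob
import subprocess
import sys

def run_isolated(pattern: str) -> int:
    worst = 0
    for path in sorted(glob.glob(pattern)):
        # A fresh interpreter per file avoids cross-file interference
        # (for example, lingering compilation or device state).
        result = subprocess.run([sys.executable, "-m", "pytest", "-s", "-v", path])
        worst = max(worst, result.returncode)
    return worst

if __name__ == "__main__":
    sys.exit(run_isolated("compile/test_*.py"))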
@@ -1,46 +0,0 @@
group: Quantization
depends_on:
  - image-build
steps:
  - label: Quantization
    timeout_in_minutes: 90
    source_file_dependencies:
      - csrc/
      - vllm/model_executor/layers/quantization
      - tests/quantization
    commands:
      # temporary install here since we need nightly, will move to requirements/test.in
      # after torchao 0.12 release, and pin a working version of torchao nightly here

      # since torchao nightly is only compatible with torch nightly currently
      # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
      # we can only upgrade after this is resolved
      # TODO(jerryzh168): resolve the above comment
      - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
      - uv pip install --system conch-triton-kernels
      - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

  - label: Quantized MoE Test (B200)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/"
    gpu: b200
    source_file_dependencies:
      - tests/quantization/test_blackwell_moe.py
      - vllm/model_executor/models/deepseek_v2.py
      - vllm/model_executor/models/gpt_oss.py
      - vllm/model_executor/models/llama4.py
      - vllm/model_executor/layers/fused_moe
      - vllm/model_executor/layers/quantization/compressed_tensors
      - vllm/model_executor/layers/quantization/modelopt.py
      - vllm/model_executor/layers/quantization/mxfp4.py
      - vllm/v1/attention/backends/flashinfer.py
    commands:
      - pytest -s -v tests/quantization/test_blackwell_moe.py

  - label: Quantized Models Test
    timeout_in_minutes: 60
    source_file_dependencies:
      - vllm/model_executor/layers/quantization
      - tests/models/quantization
    commands:
      - pytest -v -s models/quantization
@@ -1,14 +0,0 @@
group: Samplers
depends_on:
  - image-build
steps:
  - label: Samplers Test
    timeout_in_minutes: 75
    source_file_dependencies:
      - vllm/model_executor/layers
      - vllm/sampling_metadata.py
      - tests/samplers
      - tests/conftest.py
    commands:
      - pytest -v -s samplers
      - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@@ -1,13 +0,0 @@
group: Tool use
depends_on:
  - image-build
steps:
  - label: OpenAI-Compatible Tool Use
    timeout_in_minutes: 35
    mirror_hardwares: [amdexperimental]
    fast_check: false
    source_file_dependencies:
      - vllm/
      - tests/tool_use
    commands:
      - pytest -v -s tool_use
@@ -1,25 +0,0 @@
group: Weight Loading
depends_on:
  - image-build
steps:
  - label: Weight Loading Multiple GPU # 33min
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/weight_loading
    commands:
      - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

  - label: Weight Loading Multiple GPU - Large Models # optional
    working_dir: "/vllm-workspace/tests"
    num_gpus: 2
    gpu: a100
    optional: true
    source_file_dependencies:
      - vllm/
      - tests/weight_loading
    commands:
      - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
8 .github/CODEOWNERS vendored
@@ -9,7 +9,6 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/model_executor/layers/batch_invariant.py @yewentao256
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -36,9 +35,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC
 
-# Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon
-
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin
 /tests/distributed/test_multi_node_assignment.py @youkaichao
@@ -60,7 +56,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
-/tests/v1/determinism @yewentao256
 
 # Transformers modeling backend
 /vllm/model_executor/models/transformers @hmellor
@@ -146,10 +141,9 @@ mkdocs.yaml @hmellor
 /requirements/kv_connectors.txt @NickLucche
 
 # Pooling models
-/examples/pooling @noooop
+/examples/*/pooling/ @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
-/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
48 .github/mergify.yml vendored
@@ -14,52 +14,6 @@ pull_request_rules:
       comment:
         message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
-  - name: comment-pre-commit-failure
-    description: Comment on PR when pre-commit check fails
-    conditions:
-      - status-failure=pre-commit
-      - -closed
-      - -draft
-    actions:
-      comment:
-        message: |
-          Hi @{{author}}, the pre-commit checks have failed. Please run:
-
-          ```bash
-          uv pip install pre-commit
-          pre-commit install
-          pre-commit run --all-files
-          ```
-
-          Then, commit the changes and push to your branch.
-
-          For future commits, `pre-commit` will run automatically on changed files before each commit.
-
-          > [!TIP]
-          > <details>
-          > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
-          > <br/>
-          > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
-          >
-          > ```bash
-          > # For mypy (substitute "3.10" with the failing version if needed)
-          > pre-commit run --hook-stage manual mypy-3.10
-          > # For markdownlint
-          > pre-commit run --hook-stage manual markdownlint
-          > ```
-          > </details>
-
-  - name: comment-dco-failure
-    description: Comment on PR when DCO check fails
-    conditions:
-      - status-failure=dco
-      - -closed
-      - -draft
-    actions:
-      comment:
-        message: |
-          Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
-
   - name: label-ci-build
     description: Automatically apply ci/build label
     conditions:
@@ -186,7 +140,7 @@ pull_request_rules:
       - files~=^tests/entrypoints/test_context.py
       - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
+      - files~=^vllm/entrypoints/harmony_utils.py
       - files~=^vllm/entrypoints/tool_server.py
       - files~=^vllm/entrypoints/tool.py
       - files~=^vllm/entrypoints/context.py
4 .github/workflows/cleanup_pr_body.yml vendored
@@ -13,10 +13,10 @@ jobs:
 
     steps:
       - name: Checkout repository
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: '3.12'
 
25 .github/workflows/issue_autolabel.yml vendored
@@ -105,31 +105,6 @@ jobs:
             }
           ],
         },
-        cpu: {
-          // Keyword search - matches whole words only (with word boundaries)
-          keywords: [
-            {
-              term: "CPU Backend",
-              searchIn: "title"
-            },
-            {
-              term: "x86",
-              searchIn: "title"
-            },
-            {
-              term: "ARM",
-              searchIn: "title"
-            },
-            {
-              term: "Apple Silicon",
-              searchIn: "title"
-            },
-            {
-              term: "IBM Z",
-              searchIn: "title"
-            },
-          ],
-        },
         // Add more label configurations here as needed
         // example: {
         //   keywords: [...],
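The removed cpu rule labels issues whose titles contain terms such as "x86" or "ARM" as whole words only. A small Python sketch of word-boundary matching of that kind; the workflow's JavaScript may implement the check differently:

# Sketch: whole-word keyword search using regex word boundaries.
import re

def title_matches(title: str, term: str) -> bool:
    return re.search(rf"\b{re.escape(term)}\b", title, flags=re.IGNORECASE) is not None

print(title_matches("[Bug] crash on ARM build", "ARM"))  # True
print(title_matches("harmless title", "ARM"))            # False ("harmless" is not a whole-word match)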
9 .github/workflows/macos-smoke-test.yml vendored
@@ -9,10 +9,10 @@ on:
 jobs:
   macos-m1-smoke-test:
     runs-on: macos-latest
-    timeout-minutes: 30
+    timeout-minutes: 20
 
     steps:
-      - uses: actions/checkout@v6.0.1
+      - uses: actions/checkout@v4
 
       - uses: astral-sh/setup-uv@v7
         with:
@@ -37,14 +37,15 @@ jobs:
       - name: Verify installation
         run: |
           python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
 
       - name: Smoke test vllm serve
+        timeout-minutes: 10
         run: |
           # Start server in background
           vllm serve Qwen/Qwen3-0.6B \
-            --max-model-len=2K \
+            --max-model-len=2048 \
             --load-format=dummy \
-            --hf-overrides '{"num_hidden_layers": 2}' \
             --enforce-eager \
             --port 8000 &
 
4 .github/workflows/pre-commit.yml vendored
@@ -16,8 +16,8 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-      - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: "3.12"
       - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
4 .github/workflows/stale.yml vendored
@@ -7,15 +7,13 @@ on:
 
 jobs:
   close-issues-and-pull-requests:
-    # Prevents triggering on forks or other repos
-    if: github.repository == 'vllm-project/vllm'
     permissions:
       issues: write
       pull-requests: write
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
3 .gitignore vendored
@@ -4,9 +4,6 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
 
-# OpenAI triton kernels copied from source
-vllm/third_party/triton_kernels/*
-
 # triton jit
 .triton
 
126
CMakeLists.txt
126
CMakeLists.txt
@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
|
|||||||
|
|
||||||
# ROCm 5.X and 6.X
|
# ROCm 5.X and 6.X
|
||||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||||
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
|
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||||
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||||
endif()
|
endif()
|
||||||
@@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||||
set(CUTLASS_REVISION "v4.2.1")
|
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@@ -354,17 +354,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
# are not supported by Machete yet.
|
# are not supported by Machete yet.
|
||||||
|
# 9.0 for latest bf16 atomicAdd PTX
|
||||||
# marlin arches for fp16 output
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
|
|
||||||
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
# marlin arches for fp8 input
|
|
||||||
# - sm80 doesn't support fp8 computation
|
|
||||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
|
||||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
|
||||||
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
|
||||||
|
|
||||||
if (MARLIN_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -374,18 +365,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(MARLIN_GEN_SCRIPT
|
set(MARLIN_GEN_SCRIPT
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
|
||||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
|
||||||
|
|
||||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
||||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
||||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
||||||
execute_process(
|
execute_process(
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
COMMAND ${CMAKE_COMMAND} -E env
|
||||||
PYTHONPATH=$ENV{PYTHONPATH}
|
PYTHONPATH=$PYTHONPATH
|
||||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
||||||
RESULT_VARIABLE marlin_generation_result
|
RESULT_VARIABLE marlin_generation_result
|
||||||
OUTPUT_VARIABLE marlin_generation_result
|
OUTPUT_VARIABLE marlin_generation_result
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||||
@@ -398,15 +387,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"\nCheck the log for details: "
|
"\nCheck the log for details: "
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
||||||
else()
|
else()
|
||||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
||||||
CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
|
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
||||||
message(STATUS "Marlin generation completed successfully.")
|
message(STATUS "Marlin generation completed successfully.")
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||||
@@ -414,34 +403,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
|
||||||
|
|
||||||
if (MARLIN_FP8_ARCHS)
|
|
||||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set(MARLIN_SRCS
|
set(MARLIN_SRCS
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
@@ -637,15 +604,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${FP4_ARCHS}")
|
CUDA_ARCHS "${FP4_ARCHS}")
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
|
|
||||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||||
@@ -822,7 +786,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
 execute_process(
 COMMAND ${CMAKE_COMMAND} -E env
-PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
+PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
 ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
 RESULT_VARIABLE machete_generation_result
 OUTPUT_VARIABLE machete_generation_output
@@ -874,10 +838,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
 set(SRCS
-"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
-"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
-"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
-)
+"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")

 set_gencode_flags_for_srcs(
 SRCS "${SRCS}"
@@ -947,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
 "csrc/moe/torch_bindings.cpp"
 "csrc/moe/moe_align_sum_kernels.cu"
+"csrc/moe/moe_lora_align_sum_kernels.cu"
 "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -976,15 +938,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 CUDA_ARCHS "${CUDA_ARCHS}")

 list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-# moe marlin arches
-# note that we always set `use_atomic_add=False` for moe marlin now,
-# so we don't need 9.0 for bf16 atomicAdd PTX
-cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
-# moe marlin arches for fp8 input
-# - sm80 doesn't support fp8 computation
-# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
-# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
-cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+# 9.0 for latest bf16 atomicAdd PTX
+cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
 if (MARLIN_MOE_ARCHS)

 #
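The removed comments above spell out why FP8 MoE Marlin kernels are only generated for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) builds. As a purely illustrative sketch (the helper below is hypothetical Python, not vLLM's actual CMake function), the arch gating amounts to intersecting a requested arch list with the build's CUDA_ARCHS:

```python
# Hypothetical sketch of the arch filtering done by cuda_archs_loose_intersection
# in the CMake above: keep only the requested arches whose base version also
# appears in the build's CUDA_ARCHS list.
def loose_intersection(requested: list[str], cuda_archs: list[str]) -> list[str]:
    available = {arch.split("+")[0] for arch in cuda_archs}
    return [arch for arch in requested if arch.split("+")[0] in available]

# FP8 MoE Marlin is only requested for SM89 and 12.0-class builds.
print(loose_intersection(["8.9", "12.0"], ["8.0+PTX", "8.9", "9.0a"]))  # ['8.9']
```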
@@ -994,18 +949,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 set(MOE_MARLIN_GEN_SCRIPT
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
 file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
-list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
-set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

-message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
-message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")

-if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
-OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
+if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
 execute_process(
 COMMAND ${CMAKE_COMMAND} -E env
-PYTHONPATH=$ENV{PYTHONPATH}
-${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
+PYTHONPATH=$PYTHONPATH
+${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
 RESULT_VARIABLE moe_marlin_generation_result
 OUTPUT_VARIABLE moe_marlin_generation_output
 OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
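The hunk above folds the target arch list into the cached hash of the kernel-generation script, so changing CUDA_ARCHS also triggers regeneration. A minimal Python sketch of that cache key, assuming illustrative file paths and helper names that are not part of vLLM:

```python
# Illustrative sketch of the cache-skip logic: hash the generator script
# together with the arch list and regenerate only when that combined key changes.
import hashlib
from pathlib import Path

def generation_key(script: Path, cuda_archs: list[str]) -> str:
    digest = hashlib.md5(script.read_bytes()).hexdigest()
    return f"{digest}(ARCH:{','.join(cuda_archs)})"

def should_regenerate(script: Path, cuda_archs: list[str], stamp: Path) -> bool:
    key = generation_key(script, cuda_archs)
    if stamp.exists() and stamp.read_text() == key:
        return False  # same script and same archs: skip generation
    stamp.write_text(key)
    return True
```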
@@ -1018,7 +971,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 "\nCheck the log for details: "
 "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
 else()
-set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
 CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
 message(STATUS "Marlin MOE generation completed successfully.")
 endif()
@@ -1026,28 +979,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
 endif()

-file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
-list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
+file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
 set_gencode_flags_for_srcs(
-SRCS "${MARLIN_MOE_SRC}"
+SRCS "${MOE_WNAA16_MARLIN_SRC}"
 CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-set_source_files_properties(${MARLIN_MOE_SRC}
+set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
 PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
 endif()
-list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})

-if (MARLIN_MOE_FP8_ARCHS)
-file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
-set_gencode_flags_for_srcs(
-SRCS "${MARLIN_MOE_FP8_SRC}"
-CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-set_source_files_properties(${MARLIN_MOE_FP8_SRC}
-PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-endif()
-list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
-endif()
+list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

 message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
 else()
@@ -1089,11 +1030,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 WITH_SOABI)
 endif()

-# For CUDA and HIP builds also build the triton_kernels external package.
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-include(cmake/external_projects/triton_kernels.cmake)
-endif()
-
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
 include(cmake/external_projects/flashmla.cmake)
@@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

 *Latest News* 🔥

-- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
 - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
@@ -137,19 +136,16 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
-- Arm
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
 - Google Cloud
-- IBM
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
 - NVIDIA
-- Red Hat
 - Replicate
 - Roblox
 - RunPod
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```

-### 2. Maximize Throughput with a Latency Requirement
+#### 2. Maximize Throughput with a Latency Requirement

 - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
 - **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=500
 ```

-### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+#### 3. Maximize Throughput with Prefix Caching and Latency Requirements

 - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
 - **Configuration**:
@@ -18,11 +18,6 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
 MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
 NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
 NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
-HOSTNAME=$(hostname)
-if [[ -z "$HOSTNAME" ]]; then
-echo "Error: Failed to determine hostname." >&2
-exit 1
-fi

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
@@ -87,7 +82,6 @@ start_server() {
 "$MODEL"
 "--disable-log-requests"
 "--port" "8004"
-"--host" "$HOSTNAME"
 "--gpu-memory-utilization" "$gpu_memory_utilization"
 "--max-num-seqs" "$max_num_seqs"
 "--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -102,9 +96,8 @@ start_server() {
 # This correctly passes each element as a separate argument.
 if [[ -n "$profile_dir" ]]; then
 # Start server with profiling enabled
-local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
-VLLM_SERVER_DEV_MODE=1 \
-vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 else
 # Start server without profiling
 VLLM_SERVER_DEV_MODE=1 \
@@ -119,7 +112,7 @@ start_server() {
 # since that we should always have permission to send signal to the server process.
 kill -0 $server_pid 2> /dev/null || break

-RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
+RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
 STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
 if [[ "$STATUS_CODE" -eq 200 ]]; then
 server_started=1
@@ -179,7 +172,6 @@ run_benchmark() {
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 1000 \
 --random-prefix-len $prefix_len \
---host "$HOSTNAME" \
 --port 8004 &> "$bm_log"
 throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -195,7 +187,7 @@ run_benchmark() {
 request_rate=$((${throughput%.*} + 1))
 while ((request_rate > 0)); do
 # clear prefix cache
-curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
+curl -X POST http://0.0.0.0:8004/reset_prefix_cache
 sleep 5
 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
 vllm bench serve \
@@ -211,7 +203,6 @@ run_benchmark() {
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 100 \
 --random-prefix-len $prefix_len \
---host "$HOSTNAME" \
 --port 8004 &> "$bm_log"
 throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
 e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -312,7 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
 --num-prompts 100 \
 --random-prefix-len $prefix_len \
---host "$HOSTNAME" \
 --port 8004 \
 --profile &> "$bm_log"
 else
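For reference, the benchmark loop in this script starts from the measured request throughput and walks the request rate down until the P99 end-to-end latency fits the configured budget. A simplified Python sketch of that search follows; the `run_benchmark` callable is a stand-in, not part of the repository:

```python
# Simplified sketch of auto_tune.sh's request-rate search: step down from the
# measured throughput until P99 end-to-end latency fits MAX_LATENCY_ALLOWED_MS.
from collections.abc import Callable

def tune_request_rate(
    run_benchmark: Callable[[int], tuple[float, float]],  # rate -> (throughput, p99_e2el_ms)
    start_rate: int,
    max_latency_ms: float,
) -> int | None:
    rate = start_rate
    while rate > 0:
        _, p99_e2el_ms = run_benchmark(rate)
        if p99_e2el_ms <= max_latency_ms:
            return rate  # highest request rate that still meets the latency goal
        rate -= 1
    return None
```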
@@ -620,7 +620,7 @@ def get_tokenizer(
 kwargs["use_fast"] = False
 if tokenizer_mode == "mistral":
 try:
-from vllm.tokenizers.mistral import MistralTokenizer
+from vllm.transformers_utils.tokenizer import MistralTokenizer
 except ImportError as e:
 raise ImportError(
 "MistralTokenizer requires vllm package.\n"
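A hedged usage sketch of the function being patched here. The import location differs between the two sides of this diff, so the example mirrors the try/except fallback the benchmark scripts below use; the model name is only an example.

```python
# Usage sketch only; the import path depends on the vLLM version, hence the fallback.
try:
    from vllm.tokenizers import get_tokenizer
except ImportError:
    from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer("mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral")
print(len(tokenizer.encode("Hello from a Mistral tokenizer")))
```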
@@ -1,120 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.

This focuses on a single test payload shaped like the prefix-cache hash input:
    (32-byte bytes object, 32-int tuple)

Usage:
    python benchmarks/hash_micro_benchmark.py --iterations 20000
"""

from __future__ import annotations

import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable

from vllm.utils.hashing import sha256, xxhash


def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
    """Generate a deterministic test payload."""
    random.seed(seed)
    bytes_data = bytes(random.getrandbits(8) for _ in range(32))
    int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
    return (bytes_data, int_tuple)


def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
    """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
    times: list[float] = []

    # Warm-up to avoid first-run noise.
    for _ in range(200):
        func(data)

    for _ in range(iterations):
        start = time.perf_counter()
        func(data)
        end = time.perf_counter()
        times.append(end - start)

    avg = statistics.mean(times)
    std = statistics.stdev(times) if len(times) > 1 else 0.0
    return avg, std


def _run_benchmarks(
    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
    data: tuple,
    iterations: int,
):
    """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
    for name, func in benchmarks:
        try:
            avg, std = _benchmark_func(func, data, iterations)
        except ModuleNotFoundError as exc:
            print(f"Skipping {name}: {exc}")
            continue
        yield name, avg, std


def builtin_hash(data: tuple) -> int:
    """Wrapper for Python's built-in hash()."""
    return hash(data)


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    data = _generate_test_data(args.seed)
    benchmarks = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    print("=" * 60)
    print("HASH FUNCTION MICRO BENCHMARK")
    print("=" * 60)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print("=" * 60)

    results = list(_run_benchmarks(benchmarks, data, args.iterations))
    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)

    print("\nResults:")
    for name, avg, std in results:
        print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    if builtin_entry:
        _, builtin_avg, _ = builtin_entry
        print("\n" + "=" * 60)
        print("SUMMARY (relative to built-in hash())")
        print("=" * 60)
        for name, avg, _ in results:
            if name == "built-in hash()":
                continue
            speed_ratio = avg / builtin_avg
            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
    else:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")


if __name__ == "__main__":
    main()
@@ -32,11 +32,12 @@ def benchmark_propose(args):

 model_config = ModelConfig(
 model="facebook/opt-125m",
+task="generate",
 max_model_len=args.num_token + args.num_spec_token,
 tokenizer="facebook/opt-125m",
 tokenizer_mode="auto",
 dtype="auto",
-seed=0,
+seed=None,
 trust_remote_code=False,
 )
 proposer = NgramProposer(
@@ -107,10 +108,7 @@ def benchmark_batched_propose(args):
 device_config=DeviceConfig(device=current_platform.device_type),
 parallel_config=ParallelConfig(),
 load_config=LoadConfig(),
-scheduler_config=SchedulerConfig(
-max_model_len=model_config.max_model_len,
-is_encoder_decoder=model_config.is_encoder_decoder,
-),
+scheduler_config=SchedulerConfig(),
 )

 # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
@@ -1,110 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
Simple benchmark to compare prefix-cache block hashing algorithms.

Example:
    python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
"""

from __future__ import annotations

import argparse
import random
import statistics
import sys
import time
from collections.abc import Callable, Iterable, Sequence

from vllm.utils.hashing import get_hash_fn_by_name
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash

SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")


def _generate_blocks(
    num_blocks: int, block_size: int, vocab_size: int, seed: int
) -> list[list[int]]:
    rng = random.Random(seed)
    return [
        [rng.randrange(vocab_size) for _ in range(block_size)]
        for _ in range(num_blocks)
    ]


def _hash_all_blocks(
    hash_fn: Callable[[object], bytes],
    blocks: Iterable[Sequence[int]],
) -> float:
    parent_hash: BlockHash | None = None
    start = time.perf_counter()
    for block in blocks:
        parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
    end = time.perf_counter()
    return end - start


def _benchmark(
    hash_algo: str,
    blocks: list[list[int]],
    trials: int,
) -> tuple[float, float, float] | None:
    try:
        hash_fn = get_hash_fn_by_name(hash_algo)
        init_none_hash(hash_fn)
        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
    except ModuleNotFoundError as exc:
        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
        return None

    avg = statistics.mean(timings)
    best = min(timings)
    # throughput: tokens / second
    tokens_hashed = len(blocks) * len(blocks[0])
    throughput = tokens_hashed / best
    return avg, best, throughput


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
    parser.add_argument(
        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument(
        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
    )
    parser.add_argument(
        "--algorithms",
        nargs="+",
        default=SUPPORTED_ALGOS,
        choices=SUPPORTED_ALGOS,
        help="Hash algorithms to benchmark.",
    )
    args = parser.parse_args()

    blocks = _generate_blocks(
        args.num_blocks, args.block_size, args.vocab_size, args.seed
    )
    print(
        f"Benchmarking {len(args.algorithms)} algorithms on "
        f"{args.num_blocks} blocks (block size={args.block_size})."
    )

    for algo in args.algorithms:
        result = _benchmark(algo, blocks, args.trials)
        if result is None:
            continue

        avg, best, throughput = result
        print(
            f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
            f"throughput: {throughput / 1e6:.2f}M tokens/s"
        )


if __name__ == "__main__":
    main()
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
-from vllm.tokenizers import get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
 from backend_request_func import get_tokenizer

@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

 try:
-from vllm.tokenizers import get_tokenizer
+from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
 from backend_request_func import get_tokenizer

@@ -574,7 +574,7 @@ async def benchmark(
 )
 print(
 "{:<40} {:<10.2f}".format(
-"Total token throughput (tok/s):", metrics.total_token_throughput
+"Total Token throughput (tok/s):", metrics.total_token_throughput
 )
 )

@@ -963,7 +963,8 @@ def create_argument_parser():
 parser.add_argument(
 "--profile",
 action="store_true",
-help="Use vLLM Profiling. --profiler-config must be provided on the server.",
+help="Use Torch Profiler. The endpoint must be launched with "
+"VLLM_TORCH_PROFILER_DIR to enable profiler.",
 )
 parser.add_argument(
 "--result-dir",
@@ -5,12 +5,11 @@ import argparse
 import asyncio
 import logging
 import os
-import time
-import uuid
-from urllib.parse import urlparse

 import aiohttp
 from quart import Quart, Response, make_response, request
+from rate_limiter import RateLimiter
+from request_queue import RequestQueue

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -25,8 +24,26 @@ def parse_args():
 parser.add_argument(
 "--timeout",
 type=float,
-default=6 * 60 * 60,
-help="Timeout for backend service requests in seconds (default: 21600)",
+default=300,
+help="Timeout for backend service requests in seconds (default: 300)",
+)
+parser.add_argument(
+"--max-concurrent",
+type=int,
+default=100,
+help="Maximum concurrent requests to backend services (default: 100)",
+)
+parser.add_argument(
+"--queue-size",
+type=int,
+default=500,
+help="Maximum number of requests in the queue (default: 500)",
+)
+parser.add_argument(
+"--rate-limit",
+type=int,
+default=40,
+help="Maximum requests per second (default: 40)",
 )
 parser.add_argument(
 "--port",
@@ -37,32 +54,14 @@ def parse_args():
 parser.add_argument(
 "--prefill-url",
 type=str,
-default="http://localhost:8100",
-help="Prefill service base URL (protocol + host[:port])",
+default="http://localhost:8100/v1/completions",
+help="Prefill service endpoint URL",
 )
 parser.add_argument(
 "--decode-url",
 type=str,
-default="http://localhost:8200",
-help="Decode service base URL (protocol + host[:port])",
+default="http://localhost:8200/v1/completions",
+help="Decode service endpoint URL",
 )
-parser.add_argument(
-"--kv-host",
-type=str,
-default="localhost",
-help="Hostname or IP used by KV transfer (default: localhost)",
-)
-parser.add_argument(
-"--prefill-kv-port",
-type=int,
-default=14579,
-help="Prefill KV port (default: 14579)",
-)
-parser.add_argument(
-"--decode-kv-port",
-type=int,
-default=14580,
-help="Decode KV port (default: 14580)",
-)

 return parser.parse_args()
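The argparse changes above are easier to follow with the proxy's request flow in mind: a request is first sent to the prefill service with max_tokens forced to 1 (only the KV cache matters), and the real completion is then streamed from the decode service, as the handler code further down describes. A schematic sketch of that two-stage flow, assuming illustrative helper names that are not the proxy's actual code:

```python
# Illustrative sketch of the prefill-then-decode proxy flow.
import aiohttp

async def prefill_then_decode(prefill_url: str, decode_url: str, request_data: dict):
    prefill_request = dict(request_data, max_tokens=1)
    async with aiohttp.ClientSession() as session:
        # Prefill stage: the response body is ignored, only the KV cache matters.
        async with session.post(prefill_url, json=prefill_request) as resp:
            await resp.read()
        # Decode stage: stream the completion chunks back to the caller.
        async with session.post(decode_url, json=request_data) as resp:
            async for chunk in resp.content.iter_chunked(1024):
                yield chunk
```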
@@ -74,129 +73,70 @@ def main():
|
|||||||
|
|
||||||
# Initialize configuration using command line parameters
|
# Initialize configuration using command line parameters
|
||||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
|
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
|
||||||
|
MAX_CONCURRENT_REQUESTS = args.max_concurrent
|
||||||
|
REQUEST_QUEUE_SIZE = args.queue_size
|
||||||
|
RATE_LIMIT = args.rate_limit
|
||||||
PREFILL_SERVICE_URL = args.prefill_url
|
PREFILL_SERVICE_URL = args.prefill_url
|
||||||
DECODE_SERVICE_URL = args.decode_url
|
DECODE_SERVICE_URL = args.decode_url
|
||||||
PORT = args.port
|
PORT = args.port
|
||||||
|
|
||||||
PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
|
|
||||||
DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"Proxy resolved KV addresses -> prefill: %s, decode: %s",
|
|
||||||
PREFILL_KV_ADDR,
|
|
||||||
DECODE_KV_ADDR,
|
|
||||||
)
|
|
||||||
|
|
||||||
app = Quart(__name__)
|
app = Quart(__name__)
|
||||||
|
|
||||||
# Attach the configuration object to the application instance so helper
|
# Initialize the rate limiter and request queue
|
||||||
# coroutines can read the resolved backend URLs and timeouts without using
|
rate_limiter = RateLimiter(RATE_LIMIT)
|
||||||
# globals.
|
request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
|
||||||
|
|
||||||
|
# Attach the configuration object to the application instance
|
||||||
app.config.update(
|
app.config.update(
|
||||||
{
|
{
|
||||||
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
|
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
|
||||||
|
"rate_limiter": rate_limiter,
|
||||||
|
"request_queue": request_queue,
|
||||||
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
|
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
|
||||||
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
|
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
|
||||||
"PREFILL_KV_ADDR": PREFILL_KV_ADDR,
|
|
||||||
"DECODE_KV_ADDR": DECODE_KV_ADDR,
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
def _normalize_base_url(url: str) -> str:
|
# Start queue processing on app startup
|
||||||
"""Remove any trailing slash so path joins behave predictably."""
|
@app.before_serving
|
||||||
return url.rstrip("/")
|
async def startup():
|
||||||
|
"""Start request processing task when app starts serving"""
|
||||||
|
asyncio.create_task(request_queue.process())
|
||||||
|
|
||||||
def _get_host_port(url: str) -> str:
|
async def forward_request(url, data):
|
||||||
"""Return the hostname:port portion for logging and KV headers."""
|
"""Forward request to backend service with rate limiting and error handling"""
|
||||||
parsed = urlparse(url)
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
host = parsed.hostname or "localhost"
|
|
||||||
port = parsed.port
|
|
||||||
if port is None:
|
|
||||||
port = 80 if parsed.scheme == "http" else 443
|
|
||||||
return f"{host}:{port}"
|
|
||||||
|
|
||||||
PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
|
# Use rate limiter as context manager
|
||||||
DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
|
async with (
|
||||||
KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
|
rate_limiter,
|
||||||
|
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
||||||
def _build_headers(request_id: str) -> dict[str, str]:
|
):
|
||||||
"""Construct the headers expected by vLLM's P2P disagg connector."""
|
try:
|
||||||
headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
|
async with session.post(
|
||||||
api_key = os.environ.get("OPENAI_API_KEY")
|
url=url, json=data, headers=headers
|
||||||
if api_key:
|
) as response:
|
||||||
headers["Authorization"] = f"Bearer {api_key}"
|
if response.status == 200:
|
||||||
return headers
|
# Stream response chunks
|
||||||
|
async for chunk_bytes in response.content.iter_chunked(1024):
|
||||||
async def _run_prefill(
|
yield chunk_bytes
|
||||||
request_path: str,
|
else:
|
||||||
payload: dict,
|
# Handle backend service errors
|
||||||
headers: dict[str, str],
|
error_text = await response.text()
|
||||||
request_id: str,
|
logger.error(
|
||||||
):
|
"Backend service error: %s - %s",
|
||||||
url = f"{PREFILL_BASE}{request_path}"
|
response.status,
|
||||||
start_ts = time.perf_counter()
|
error_text,
|
||||||
logger.info("[prefill] start request_id=%s url=%s", request_id, url)
|
)
|
||||||
try:
|
yield b'{"error": "Backend service error"}'
|
||||||
async with (
|
except aiohttp.ClientError as e:
|
||||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
# Handle connection errors
|
||||||
session.post(url=url, json=payload, headers=headers) as resp,
|
logger.error("Connection error to %s: %s", url, str(e))
|
||||||
):
|
yield b'{"error": "Service unavailable"}'
|
||||||
if resp.status != 200:
|
except asyncio.TimeoutError:
|
||||||
error_text = await resp.text()
|
# Handle timeout errors
|
||||||
raise RuntimeError(
|
logger.error("Timeout connecting to %s", url)
|
||||||
f"Prefill backend error {resp.status}: {error_text}"
|
yield b'{"error": "Service timeout"}'
|
||||||
)
|
|
||||||
await resp.read()
|
|
||||||
logger.info(
|
|
||||||
"[prefill] done request_id=%s status=%s elapsed=%.2fs",
|
|
||||||
request_id,
|
|
||||||
resp.status,
|
|
||||||
time.perf_counter() - start_ts,
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError as exc:
|
|
||||||
raise RuntimeError(f"Prefill service timeout at {url}") from exc
|
|
||||||
except aiohttp.ClientError as exc:
|
|
||||||
raise RuntimeError(f"Prefill service unavailable at {url}") from exc
|
|
||||||
|
|
||||||
async def _stream_decode(
|
|
||||||
request_path: str,
|
|
||||||
payload: dict,
|
|
||||||
headers: dict[str, str],
|
|
||||||
request_id: str,
|
|
||||||
):
|
|
||||||
url = f"{DECODE_BASE}{request_path}"
|
|
||||||
# Stream tokens from the decode service once the prefill stage has
|
|
||||||
# materialized KV caches on the target workers.
|
|
||||||
logger.info("[decode] start request_id=%s url=%s", request_id, url)
|
|
||||||
try:
|
|
||||||
async with (
|
|
||||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
|
||||||
session.post(url=url, json=payload, headers=headers) as resp,
|
|
||||||
):
|
|
||||||
if resp.status != 200:
|
|
||||||
error_text = await resp.text()
|
|
||||||
logger.error(
|
|
||||||
"Decode backend error %s - %s", resp.status, error_text
|
|
||||||
)
|
|
||||||
err_msg = (
|
|
||||||
'{"error": "Decode backend error ' + str(resp.status) + '"}'
|
|
||||||
)
|
|
||||||
yield err_msg.encode()
|
|
||||||
return
|
|
||||||
logger.info(
|
|
||||||
"[decode] streaming response request_id=%s status=%s",
|
|
||||||
request_id,
|
|
||||||
resp.status,
|
|
||||||
)
|
|
||||||
async for chunk_bytes in resp.content.iter_chunked(1024):
|
|
||||||
yield chunk_bytes
|
|
||||||
logger.info("[decode] finished streaming request_id=%s", request_id)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logger.error("Decode service timeout at %s", url)
|
|
||||||
yield b'{"error": "Decode service timeout"}'
|
|
||||||
except aiohttp.ClientError as exc:
|
|
||||||
logger.error("Decode service error at %s: %s", url, exc)
|
|
||||||
yield b'{"error": "Decode service unavailable"}'
|
|
||||||
|
|
||||||
async def process_request():
|
async def process_request():
|
||||||
"""Process a single request through prefill and decode stages"""
|
"""Process a single request through prefill and decode stages"""
|
||||||
@@ -206,27 +146,13 @@ def main():
|
|||||||
# Create prefill request (max_tokens=1)
|
# Create prefill request (max_tokens=1)
|
||||||
prefill_request = original_request_data.copy()
|
prefill_request = original_request_data.copy()
|
||||||
prefill_request["max_tokens"] = 1
|
prefill_request["max_tokens"] = 1
|
||||||
if "max_completion_tokens" in prefill_request:
|
|
||||||
prefill_request["max_completion_tokens"] = 1
|
|
||||||
|
|
||||||
# Execute prefill stage
|
# Execute prefill stage
|
||||||
# The request id encodes both KV socket addresses so the backend can
|
async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
|
||||||
# shuttle tensors directly via NCCL once the prefill response
|
continue
|
||||||
# completes.
|
|
||||||
request_id = (
|
|
||||||
f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
|
|
||||||
f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
|
|
||||||
)
|
|
||||||
|
|
||||||
headers = _build_headers(request_id)
|
|
||||||
await _run_prefill(request.path, prefill_request, headers, request_id)
|
|
||||||
|
|
||||||
# Execute decode stage and stream response
|
# Execute decode stage and stream response
|
||||||
# Pass the unmodified user request so the decode phase can continue
|
generator = forward_request(DECODE_SERVICE_URL, original_request_data)
|
||||||
# sampling with the already-populated KV cache.
|
|
||||||
generator = _stream_decode(
|
|
||||||
request.path, original_request_data, headers, request_id
|
|
||||||
)
|
|
||||||
response = await make_response(generator)
|
response = await make_response(generator)
|
||||||
response.timeout = None # Disable timeout for streaming response
|
response.timeout = None # Disable timeout for streaming response
|
||||||
return response
|
return response
|
||||||
@@ -242,10 +168,23 @@ def main():
|
|||||||
@app.route("/v1/completions", methods=["POST"])
|
@app.route("/v1/completions", methods=["POST"])
|
||||||
async def handle_request():
|
async def handle_request():
|
||||||
"""Handle incoming API requests with concurrency and rate limiting"""
|
"""Handle incoming API requests with concurrency and rate limiting"""
|
||||||
|
# Create task for request processing
|
||||||
|
task = asyncio.create_task(process_request())
|
||||||
|
|
||||||
|
# Enqueue request or reject if queue is full
|
||||||
|
if not await request_queue.enqueue(task):
|
||||||
|
return Response(
|
||||||
|
response=b'{"error": "Server busy, try again later"}',
|
||||||
|
status=503,
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return await process_request()
|
# Return the response from the processing task
|
||||||
|
return await task
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
logger.warning("Request cancelled")
|
# Handle task cancellation (timeout or queue full)
|
||||||
|
logger.warning("Request cancelled due to timeout or queue full")
|
||||||
return Response(
|
return Response(
|
||||||
response=b'{"error": "Request cancelled"}',
|
response=b'{"error": "Request cancelled"}',
|
||||||
status=503,
|
status=503,
|
||||||
|
|||||||
@@ -14,9 +14,6 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
import vllm._custom_ops as ops
|
import vllm._custom_ops as ops
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|
||||||
per_token_group_quant_fp8,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -25,7 +22,6 @@ class bench_params_t:
|
|||||||
hidden_size: int
|
hidden_size: int
|
||||||
add_residual: bool
|
add_residual: bool
|
||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
group_size: list[int]
|
|
||||||
|
|
||||||
def description(self):
|
def description(self):
|
||||||
return (
|
return (
|
||||||
@@ -33,7 +29,6 @@ class bench_params_t:
|
|||||||
f"x D {self.hidden_size} "
|
f"x D {self.hidden_size} "
|
||||||
f"x R {self.add_residual} "
|
f"x R {self.add_residual} "
|
||||||
f"x DT {self.dtype}"
|
f"x DT {self.dtype}"
|
||||||
f"x GS {self.group_size}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -43,11 +38,10 @@ def get_bench_params() -> list[bench_params_t]:
|
|||||||
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
||||||
ADD_RESIDUAL = [True, False]
|
ADD_RESIDUAL = [True, False]
|
||||||
DTYPES = [torch.bfloat16, torch.float]
|
DTYPES = [torch.bfloat16, torch.float]
|
||||||
GROUP_SIZES = [[1, 64], [1, 128]]
|
|
||||||
|
|
||||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
|
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
||||||
bench_params = list(
|
bench_params = list(
|
||||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
|
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
|
||||||
)
|
)
|
||||||
return bench_params
|
return bench_params
|
||||||
|
|
||||||
@@ -58,7 +52,6 @@ def unfused_int8_impl(
|
|||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: torch.Tensor | None,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
torch_out = None
|
torch_out = None
|
||||||
@@ -76,7 +69,6 @@ def unfused_fp8_impl(
|
|||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: torch.Tensor | None,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
torch_out = None
|
torch_out = None
|
||||||
@@ -89,63 +81,23 @@ def unfused_fp8_impl(
|
|||||||
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
||||||
|
|
||||||
|
|
||||||
def unfused_groupwise_fp8_impl(
|
|
||||||
rms_norm_layer: RMSNorm,
|
|
||||||
x: torch.Tensor,
|
|
||||||
residual: torch.Tensor | None,
|
|
||||||
quant_dtype: torch.dtype,
|
|
||||||
group_size: list[int],
|
|
||||||
):
|
|
||||||
# Norm
|
|
||||||
torch_out = None
|
|
||||||
if residual is None:
|
|
||||||
torch_out = rms_norm_layer.forward_cuda(x, residual)
|
|
||||||
else:
|
|
||||||
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
|
|
||||||
|
|
||||||
# Quant
|
|
||||||
torch_out, _ = per_token_group_quant_fp8(
|
|
||||||
torch_out, group_size=group_size[1], use_ue8m0=False
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def fused_impl(
|
def fused_impl(
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
rms_norm_layer: RMSNorm, # this stores the weights
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: torch.Tensor | None,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
):
|
):
|
||||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||||
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def fused_groupwise_impl(
|
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
|
||||||
x: torch.Tensor,
|
|
||||||
residual: torch.Tensor | None,
|
|
||||||
quant_dtype: torch.dtype,
|
|
||||||
group_size: list[int],
|
|
||||||
):
|
|
||||||
out, _ = ops.rms_norm_per_block_quant(
|
|
||||||
x,
|
|
||||||
rms_norm_layer.weight,
|
|
||||||
1e-6,
|
|
||||||
quant_dtype,
|
|
||||||
group_size,
|
|
||||||
residual=residual,
|
|
||||||
is_scale_transposed=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Bench functions
|
# Bench functions
|
||||||
def bench_fn(
|
def bench_fn(
|
||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor,
|
residual: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
group_size: list[int],
|
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
fn: Callable,
|
fn: Callable,
|
||||||
@@ -158,11 +110,10 @@ def bench_fn(
|
|||||||
"x": x,
|
"x": x,
|
||||||
"residual": residual,
|
"residual": residual,
|
||||||
"quant_dtype": quant_dtype,
|
"quant_dtype": quant_dtype,
|
||||||
"group_size": group_size,
|
|
||||||
"fn": fn,
|
"fn": fn,
|
||||||
}
|
}
|
||||||
return TBenchmark.Timer(
|
return TBenchmark.Timer(
|
||||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
|
stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
@@ -196,7 +147,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.int8,
|
torch.int8,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
unfused_int8_impl,
|
unfused_int8_impl,
|
||||||
@@ -211,7 +161,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.float8_e4m3fn,
|
torch.float8_e4m3fn,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
unfused_fp8_impl,
|
unfused_fp8_impl,
|
||||||
@@ -226,7 +175,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.int8,
|
torch.int8,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
fused_impl,
|
fused_impl,
|
||||||
@@ -241,7 +189,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
x,
|
x,
|
||||||
residual,
|
residual,
|
||||||
torch.float8_e4m3fn,
|
torch.float8_e4m3fn,
|
||||||
params.group_size,
|
|
||||||
label,
|
label,
|
||||||
sub_label,
|
sub_label,
|
||||||
fused_impl,
|
fused_impl,
|
||||||
@@ -249,36 +196,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# unfused groupwise fp8 impl.
|
|
||||||
timers.append(
|
|
||||||
bench_fn(
|
|
||||||
layer,
|
|
||||||
x,
|
|
||||||
residual,
|
|
||||||
torch.float8_e4m3fn,
|
|
||||||
params.group_size,
|
|
||||||
label,
|
|
||||||
sub_label,
|
|
||||||
unfused_groupwise_fp8_impl,
|
|
||||||
"unfused_groupwise_fp8_impl",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# fused groupwise fp8 impl.
|
|
||||||
timers.append(
|
|
||||||
bench_fn(
|
|
||||||
layer,
|
|
||||||
x,
|
|
||||||
residual,
|
|
||||||
torch.float8_e4m3fn,
|
|
||||||
params.group_size,
|
|
||||||
label,
|
|
||||||
sub_label,
|
|
||||||
fused_groupwise_impl,
|
|
||||||
"fused_groupwise_fp8_impl",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
print_timers(timers)
|
print_timers(timers)
|
||||||
|
|
||||||
return timers
|
return timers
|
||||||
|
|||||||
@@ -1,244 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from enum import Enum
|
|
||||||
from itertools import product
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.utils.benchmark as TBenchmark
|
|
||||||
from torch.utils.benchmark import Measurement as TMeasurement
|
|
||||||
|
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|
||||||
_per_token_group_quant_fp8_colmajor,
|
|
||||||
silu_mul_per_token_group_quant_fp8_colmajor,
|
|
||||||
)
|
|
||||||
from vllm.triton_utils import triton
|
|
||||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
|
||||||
|
|
||||||
from .utils import ArgPool, Bench, CudaGraphBenchParams
|
|
||||||
|
|
||||||
GROUP_SIZE = 128
|
|
||||||
FLOAT8_T = torch.float8_e4m3fn
|
|
||||||
|
|
||||||
|
|
||||||
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
|
|
||||||
print(
|
|
||||||
f"Note : The timings reported above is for {cuda_graph_nops} "
|
|
||||||
"consecutive invocations of the benchmarking functions. "
|
|
||||||
f"Please divide by {cuda_graph_nops} for single invocation "
|
|
||||||
"timings."
|
|
||||||
)
|
|
||||||
compare = TBenchmark.Compare(timers)
|
|
||||||
compare.print()
|
|
||||||
|
|
||||||
|
|
||||||
class ImplType(Enum):
|
|
||||||
SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
|
|
||||||
REFERENCE = 2
|
|
||||||
|
|
||||||
def get_impl(self):
|
|
||||||
if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
|
||||||
return silu_mul_per_token_group_quant_fp8_colmajor
|
|
||||||
elif self == ImplType.REFERENCE:
|
|
||||||
return reference
|
|
||||||
raise ValueError(f"Unrecognized ImplType {self}")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class BenchmarkTensors:
|
|
||||||
input: torch.Tensor
|
|
||||||
output: torch.Tensor
|
|
||||||
|
|
||||||
# Reference act output tensor
|
|
||||||
ref_act_out: torch.Tensor
|
|
||||||
ref_quant_out: torch.Tensor
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def make(T: int, N: int) -> "BenchmarkTensors":
|
|
||||||
assert T % GROUP_SIZE == 0
|
|
||||||
assert N % (GROUP_SIZE * 2) == 0
|
|
||||||
|
|
||||||
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
|
|
||||||
|
|
||||||
# silu_mul_per_token_group_quant_fp8_colmajor output.
|
|
||||||
output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
|
|
||||||
FLOAT8_T
|
|
||||||
)
|
|
||||||
|
|
||||||
# reference output.
|
|
||||||
ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
|
|
||||||
ref_quant_out = torch.empty(
|
|
||||||
(T, N // 2), dtype=torch.bfloat16, device="cuda"
|
|
||||||
).to(FLOAT8_T)
|
|
||||||
|
|
||||||
return BenchmarkTensors(
|
|
||||||
input=input,
|
|
||||||
output=output,
|
|
||||||
ref_act_out=ref_act_out,
|
|
||||||
ref_quant_out=ref_quant_out,
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def T(self):
|
|
||||||
return self.input.size(0)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def N(self):
|
|
||||||
return self.input.size(1)
|
|
||||||
|
|
||||||
def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
|
|
||||||
if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
|
||||||
return {
|
|
||||||
"input": self.input,
|
|
||||||
"output": self.output,
|
|
||||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
|
||||||
}
|
|
||||||
elif impl_type == ImplType.REFERENCE:
|
|
||||||
return {
|
|
||||||
"input": self.input,
|
|
||||||
"act_out": self.ref_act_out,
|
|
||||||
"quant_out": self.ref_quant_out,
|
|
||||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
|
||||||
}
|
|
||||||
raise ValueError(f"Unrecognized impl_type {impl_type}")
|
|
||||||
|
|
||||||
|
|
||||||
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
|
|
||||||
"""
|
|
||||||
Reference triton quant kernel from,
|
|
||||||
vllm.model_executor.layers.quantization.utils.fp8_utils
|
|
||||||
"""
|
|
||||||
assert quant_out.size() == x.size()
|
|
||||||
# Allocate the scale tensor column-major format.
|
|
||||||
shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
|
|
||||||
x_q = quant_out
|
|
||||||
x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
|
|
||||||
|
|
||||||
M = x.numel() // GROUP_SIZE
|
|
||||||
N = GROUP_SIZE
|
|
||||||
BLOCK = triton.next_power_of_2(N)
|
|
||||||
# heuristics for number of warps
|
|
||||||
num_warps = min(max(BLOCK // 256, 1), 8)
|
|
||||||
num_stages = 1
|
|
||||||
|
|
||||||
finfo = torch.finfo(FLOAT8_T)
|
|
||||||
fp8_min = finfo.min
|
|
||||||
fp8_max = finfo.max
|
|
||||||
|
|
||||||
_per_token_group_quant_fp8_colmajor[(M,)](
|
|
||||||
x,
|
|
||||||
x_q,
|
|
||||||
x_s,
|
|
||||||
GROUP_SIZE,
|
|
||||||
x.shape[1],
|
|
||||||
x.stride(0),
|
|
||||||
x_s.stride(1),
|
|
||||||
eps=1e-10,
|
|
||||||
fp8_min=fp8_min,
|
|
||||||
fp8_max=fp8_max,
|
|
||||||
use_ue8m0=use_ue8m0,
|
|
||||||
BLOCK=BLOCK,
|
|
||||||
num_warps=num_warps,
|
|
||||||
num_stages=num_stages,
|
|
||||||
)
|
|
||||||
return x_q, x_s
|
|
||||||
|
|
||||||
|
|
||||||
def reference(
|
|
||||||
input: torch.Tensor,
|
|
||||||
act_out: torch.Tensor,
|
|
||||||
quant_out: torch.Tensor,
|
|
||||||
use_ue8m0: bool,
|
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
|
||||||
torch.ops._C.silu_and_mul(act_out, input)
|
|
||||||
return reference_quant(act_out, quant_out, use_ue8m0)
|
|
||||||
|
|
||||||
|
|
||||||
def bench_impl(
|
|
||||||
bench_tensors: list[BenchmarkTensors], impl_type: ImplType
|
|
||||||
) -> TMeasurement:
|
|
||||||
T = bench_tensors[0].T
|
|
||||||
N = bench_tensors[0].N
|
|
||||||
|
|
||||||
arg_pool_size = len(bench_tensors)
|
|
||||||
kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
|
|
||||||
|
|
||||||
# warmup
|
|
||||||
for kwargs in kwargs_list:
|
|
||||||
impl_type.get_impl()(**kwargs)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
|
||||||
for _kwargs in kwargs_list:
|
|
||||||
for k, v in _kwargs.items():
|
|
||||||
kwargs[k].values.append(v)
|
|
||||||
|
|
||||||
cuda_graph_params = None
|
|
||||||
cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
|
|
||||||
timer = None
|
|
||||||
with Bench(
|
|
||||||
cuda_graph_params,
|
|
||||||
"silu-mul-quant",
|
|
||||||
f"num_tokens={T}, N={N}",
|
|
||||||
impl_type.name,
|
|
||||||
impl_type.get_impl(),
|
|
||||||
**kwargs,
|
|
||||||
) as bench:
|
|
||||||
timer = bench.run()
|
|
||||||
return timer
|
|
||||||
|
|
||||||
|
|
||||||
def test_correctness(T: int, N: int):
|
|
||||||
print(f"Testing num_tokens={T}, N={N} ...")
|
|
||||||
|
|
||||||
bench_tensor = BenchmarkTensors.make(T, N)
|
|
||||||
|
|
||||||
def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
|
|
||||||
return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
|
|
||||||
|
|
||||||
# reference output
|
|
||||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
|
||||||
|
|
||||||
# test ouptut
|
|
||||||
out_q, out_s = output_from_impl(
|
|
||||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
|
||||||
)
|
|
||||||
|
|
||||||
torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
|
|
||||||
torch.testing.assert_close(ref_out_s, out_s)
|
|
||||||
|
|
||||||
|
|
||||||
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
|
|
||||||
timers = []
|
|
||||||
for N, T in product(Ns, Ts):
|
|
||||||
test_correctness(T, N)
|
|
||||||
|
|
||||||
bench_tensors: list[BenchmarkTensors] = [
|
|
||||||
BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
|
|
||||||
]
|
|
||||||
|
|
||||||
silu_mul_quant_timer = bench_impl(
|
|
||||||
bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
|
||||||
)
|
|
||||||
timers.append(silu_mul_quant_timer)
|
|
||||||
reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
|
|
||||||
timers.append(reference_timer)
|
|
||||||
|
|
||||||
print_timers(
|
|
||||||
[silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
|
|
||||||
)
|
|
||||||
|
|
||||||
print_timers(timers, cuda_graph_nops=arg_pool_size)
|
|
||||||
|
|
||||||
return timers
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
|
|
||||||
N = [2048, 4096, 8192]
|
|
||||||
|
|
||||||
print(f"T = {T}, N = {N}")
|
|
||||||
run(T, N, arg_pool_size=8)
|
|
||||||
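For orientation, the operation benchmarked above is silu(x1) * x2 followed by per-token-group FP8 quantization. The snippet below is a hypothetical, pure-PyTorch restatement of that math written for this comparison only (it assumes GROUP_SIZE = 128 and e4m3 FP8, ignores the column-major scale layout and the UE8M0 scale option, and is not the vLLM Triton/CUDA kernel):

# Hedged sketch: what silu-mul + per-token-group FP8 quant computes, in plain PyTorch.
import torch

GROUP_SIZE = 128                      # assumed group size
FLOAT8_T = torch.float8_e4m3fn        # assumed FP8 format


def silu_mul_group_quant_ref(x: torch.Tensor, eps: float = 1e-10):
    # x: [T, N]; split into gate/up halves and apply silu(x1) * x2
    x1, x2 = x.chunk(2, dim=-1)
    y = torch.nn.functional.silu(x1.float()) * x2.float()      # [T, N//2]
    # group the last dimension and compute one scale per group
    g = y.view(*y.shape[:-1], -1, GROUP_SIZE)                   # [T, G, GROUP_SIZE]
    amax = g.abs().amax(dim=-1, keepdim=True).clamp_min(eps)
    scale = amax / torch.finfo(FLOAT8_T).max
    q = (g / scale).clamp(torch.finfo(FLOAT8_T).min,
                          torch.finfo(FLOAT8_T).max).to(FLOAT8_T)
    return q.view_as(y), scale.squeeze(-1)                      # quantized values, per-group scales


if __name__ == "__main__":
    x = torch.randn(256, 2048, dtype=torch.bfloat16, device="cuda")
    q, s = silu_mul_group_quant_ref(x)
    print(q.shape, s.shape)  # [256, 1024], [256, 8]

This is only meant to make the quantization step explicit; the benchmark itself exercises the fused kernel and the Triton reference imported above.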
@@ -255,8 +255,8 @@ def bench_run(
     torch.cuda.synchronize()
 
     # Timing
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies = []
     for _ in range(num_iters):
@@ -237,7 +237,6 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
         b_q_weight=w_q,
         b_bias=None,
         b_scales=w_s,
-        a_scales=None,
         global_scale=None,
         b_zeros=w_zp,
         g_idx=g_idx,
@@ -263,7 +263,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -1,150 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.

This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""

import time
from collections.abc import Callable

import torch

# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64


def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Original torch.cat approach with expand."""
    return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)


def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Optimized direct copy approach (avoids expand + cat overhead)."""
    k = torch.empty(
        (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
        dtype=k_nope.dtype,
        device=k_nope.device,
    )
    k[..., : k_nope.shape[-1]] = k_nope
    k[..., k_nope.shape[-1] :] = k_pe
    return k


def benchmark_method(
    method: Callable,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    num_warmup: int = 10,
    num_iters: int = 100,
) -> float:
    """Benchmark a concatenation method and return mean latency in ms."""
    # Warmup
    for _ in range(num_warmup):
        _ = method(k_nope, k_pe)
    torch.cuda.synchronize()

    # Benchmark
    start = time.perf_counter()
    for _ in range(num_iters):
        _ = method(k_nope, k_pe)
    torch.cuda.synchronize()
    end = time.perf_counter()

    return (end - start) / num_iters * 1000  # Convert to ms


@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
    """Run benchmark for a specific dtype."""
    torch.set_default_device("cuda")

    # Batch sizes to test (powers of 2 from 32 to 65536)
    batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]

    print("=" * 80)
    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
    print("=" * 80)
    print(
        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
        f"k_pe=[B, 1, {PE_DIM}]"
    )
    print(f"dtype: {dtype_name}")
    print()
    print(
        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
        f"{'Speedup':>8} | {'Reduction':>10}"
    )
    print("-" * 70)

    results = []
    for batch_size in batch_sizes:
        # Create input tensors (generate in float32 then convert for FP8 compatibility)
        k_nope = torch.randn(
            batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)
        k_pe = torch.randn(
            batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)

        # Benchmark both methods
        cat_time = benchmark_method(cat_method, k_nope, k_pe)
        direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)

        speedup = cat_time / direct_time
        reduction = (1 - direct_time / cat_time) * 100

        results.append((batch_size, cat_time, direct_time, speedup, reduction))

        print(
            f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
            f"{speedup:>7.2f}x | {reduction:>9.1f}%"
        )

    print("=" * 80)

    # Summary statistics
    speedups = [r[3] for r in results]
    print("\nSpeedup summary:")
    print(f"  Min:  {min(speedups):.2f}x")
    print(f"  Max:  {max(speedups):.2f}x")
    print(f"  Mean: {sum(speedups) / len(speedups):.2f}x")

    # Find crossover point
    crossover_batch = None
    for batch_size, _, _, speedup, _ in results:
        if speedup >= 1.0:
            crossover_batch = batch_size
            break

    print("\nConclusion:")
    if crossover_batch:
        print(f"  - Direct copy becomes beneficial at batch size >= {crossover_batch}")
    # Filter for large batches (>= 512 which is typical for prefill)
    large_batch_speedups = [r[3] for r in results if r[0] >= 512]
    if large_batch_speedups:
        avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
        print(f"  - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
    print("  - MLA prefill typically uses large batches, so optimization is effective")

    return results


@torch.inference_mode()
def main():
    # Test bfloat16
    print("\n")
    run_benchmark(torch.bfloat16, "bfloat16")

    # Test float8_e4m3fn
    print("\n")
    run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")


if __name__ == "__main__":
    main()
@@ -185,8 +185,8 @@ def benchmark_config(
     graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -24,15 +24,12 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
-ep_size_range = [1, 8]
-configs = list(
-    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
-)
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
 
 
 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
+        x_names=["num_tokens", "num_experts", "topk"],
         x_vals=configs,
         line_arg="provider",
         line_vals=["vllm"],
@@ -41,26 +38,16 @@ configs = list(
         args={},
     )
 )
-def benchmark(num_tokens, num_experts, topk, ep_size, provider):
+def benchmark(num_tokens, num_experts, topk, provider):
     """Benchmark function for Triton."""
     block_size = 256
-    torch.cuda.manual_seed_all(0)
     topk_ids = get_topk_ids(num_tokens, num_experts, topk)
 
-    e_map = None
-    if ep_size != 1:
-        local_e = num_experts // ep_size
-        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
-        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
-        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
-
     quantiles = [0.5, 0.2, 0.8]
 
     if provider == "vllm":
         ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(
-                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
-            ),
+            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
             quantiles=quantiles,
         )
 
@@ -105,8 +105,8 @@ def benchmark_permute(
     graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
     graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -6,7 +6,7 @@
 #
 # The CSV file (named with current date/time) contains these columns:
 # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
+# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
 # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
 # speedup
 #
@@ -86,8 +86,9 @@ def benchmark_mrope(
     num_heads: int,
     num_kv_heads: int,
     max_position: int = 8192,
+    rope_theta: float = 10000,
     is_neox_style: bool = True,
-    rope_parameters: dict[str, Any] | None = None,
+    rope_scaling: dict[str, Any] = None,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 0,
     warmup_iter: int = 10,
@@ -99,9 +100,11 @@ def benchmark_mrope(
     # the parameters to compute the q k v size based on tp_size
     mrope_helper_class = get_rope(
         head_size=head_dim,
+        rotary_dim=head_dim,
         max_position=max_position,
+        base=rope_theta,
         is_neox_style=is_neox_style,
-        rope_parameters=rope_parameters,
+        rope_scaling=rope_scaling,
         dtype=dtype,
     ).to(device=device)
 
@@ -200,8 +203,9 @@ def benchmark_mrope(
             num_kv_heads,
             head_dim,
             max_position,
+            rope_theta,
             is_neox_style,
-            str(rope_parameters),
+            str(rope_scaling),
             str(dtype).split(".")[-1],
             torch_stats["mean"],
             torch_stats["median"],
@@ -251,8 +255,9 @@ if __name__ == "__main__":
         "num_kv_heads",
         "head_dim",
         "max_position",
+        "rope_theta",
         "is_neox_style",
-        "rope_parameters",
+        "rope_scaling",
        "dtype",
         "torch_mean",
         "torch_median",
@@ -298,7 +303,7 @@ if __name__ == "__main__":
     q_size = num_heads * head_dim
     kv_size = num_kv_heads * head_dim
     is_neox_style = True
-    rope_parameters = config.rope_parameters
+    rope_theta = config.rope_theta
     max_position = config.max_position_embeddings
 
     for num_tokens in num_tokens_list:
@@ -310,8 +315,9 @@ if __name__ == "__main__":
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            max_position=max_position,
+           rope_theta=rope_theta,
            is_neox_style=is_neox_style,
-           rope_parameters=rope_parameters,
+           rope_scaling=config.rope_scaling,
            dtype=getattr(torch, args.dtype),
            seed=args.seed,
            warmup_iter=args.warmup_iter,
@@ -30,8 +30,8 @@ def _time_cuda(
     fn()
     torch.cuda.synchronize()
 
-    start = torch.Event(enable_timing=True)
-    end = torch.Event(enable_timing=True)
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
 
     start.record()
     for _ in range(bench_iters):
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
     def benchmark(batch_size, seq_len, num_heads, provider):
         dtype = torch.bfloat16
         max_position = 8192
-        rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
-        rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
+        base = 10000
+        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
         rope = rope.to(dtype=dtype, device=device)
         cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
 
@@ -253,8 +253,8 @@ def benchmark(
     )
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     # Benchmark
     latencies: list[float] = []
@@ -127,8 +127,8 @@ def benchmark_decode(
 
     def time_fn(fn, warmup=10, trials=20):
         torch.cuda.synchronize()
-        start = torch.Event(enable_timing=True)
-        end = torch.Event(enable_timing=True)
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
         times = []
         for i in range(warmup):
             fn()
@@ -139,8 +139,8 @@ def benchmark_prefill(
 
     def time_fn(fn, warmup=10, trials=20):
         torch.cuda.synchronize()
-        start = torch.Event(enable_timing=True)
-        end = torch.Event(enable_timing=True)
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
         times = []
         for i in range(warmup):
             fn()
@@ -183,8 +183,8 @@ def benchmark_config(
     run()
     torch.cuda.synchronize()
 
-    start_event = torch.Event(enable_timing=True)
-    end_event = torch.Event(enable_timing=True)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -2,7 +2,7 @@
 
 This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
 
-Currently, this just includes dense GEMMs and only works on Hopper GPUs.
+Currently this just includes dense GEMMs and only works on Hopper GPUs.
 
 ## Setup
 
@@ -55,10 +55,6 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
 ----------------------------------------------------------------------------------------------------
 ```
 
-If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
-and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
-benchmark-only runtime so the reported throughput stays comparable).
-
 ### JSON configuration file for synthetic conversations generation
 
 The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
@@ -1076,7 +1076,6 @@ def process_statistics(
     verbose: bool,
     gen_conv_args: GenConvArgs | None = None,
     excel_output: bool = False,
-    warmup_runtime_sec: float | None = None,
 ) -> None:
     if len(client_metrics) == 0:
         logger.info("No samples to process")
@@ -1170,13 +1169,8 @@ def process_statistics(
     # Convert milliseconds to seconds
     runtime_sec = runtime_sec / 1000.0
     requests_per_sec = float(len(df)) / runtime_sec
-    params = {
-        "runtime_sec": runtime_sec,
-        "requests_per_sec": requests_per_sec,
-    }
-    if warmup_runtime_sec is not None:
-        params["warmup_runtime_sec"] = warmup_runtime_sec
-        params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
+    params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
 
     # Generate a summary of relevant metrics (and drop irrelevant data)
     df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1558,8 +1552,6 @@ async def main() -> None:
         url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
     )
 
-    warmup_runtime_sec: float | None = None
-
     # Warm-up step
     if args.warmup_step:
         # Only send a single user prompt from every conversation.
@@ -1574,56 +1566,26 @@ async def main() -> None:
         # all clients should finish their work before exiting
         warmup_bench_args = bench_args._replace(early_stop=False)
 
-        logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
-        warmup_start_ns = time.perf_counter_ns()
+        logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
         conversations, _ = await main_mp(
             warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
         )
-        warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
-        logger.info(
-            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
-            Color.PURPLE,
-            warmup_runtime_sec,
-            warmup_runtime_sec * 1000,
-            Color.RESET,
-        )
-        logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
+        logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
 
     # Run the benchmark
-    benchmark_start_ns = time.perf_counter_ns()
+    start_time = time.perf_counter_ns()
     client_convs, client_metrics = await main_mp(
         client_args, req_args, bench_args, tokenizer, conversations
     )
-    benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
+    total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
 
     # Calculate requests per second
-    requests_per_sec = len(client_metrics) / benchmark_runtime_sec
-    benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
+    total_runtime_sec = total_runtime_ms / 1000.0
+    rps = len(client_metrics) / total_runtime_sec
     logger.info(
-        "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
-        "requests per second: %.3f%s",
-        Color.GREEN,
-        benchmark_runtime_sec,
-        benchmark_runtime_ms,
-        requests_per_sec,
-        Color.RESET,
+        f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
+        f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
     )
-    if warmup_runtime_sec is not None:
-        total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
-        logger.info(
-            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
-            Color.GREEN,
-            warmup_runtime_sec,
-            warmup_runtime_sec * 1000,
-            Color.RESET,
-        )
-        logger.info(
-            "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
-            Color.GREEN,
-            total_runtime_sec,
-            total_runtime_sec * 1000,
-            Color.RESET,
-        )
 
     # Benchmark parameters
     params = {
@@ -1648,7 +1610,6 @@ async def main() -> None:
         verbose=args.verbose,
         gen_conv_args=gen_conv_args,
         excel_output=args.excel_output,
-        warmup_runtime_sec=warmup_runtime_sec,
     )
 
     if args.output_file is not None:
@@ -251,6 +251,17 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
   endif()
 
   # Build ACL with CMake
+  set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+  set(CMAKE_BUILD_TYPE "Release")
+  set(ARM_COMPUTE_ARCH "armv8.2-a")
+  set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+  set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+  set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+  set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+  set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+  set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+  set(ARM_COMPUTE_BUILD_TESTING "OFF")
 
   set(_cmake_config_cmd
     ${CMAKE_COMMAND} -G Ninja -B build
     -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
@@ -364,7 +375,6 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
         "csrc/cpu/shm.cpp"
-        "csrc/cpu/cpu_wna16.cpp"
         ${VLLM_EXT_SRC})
     if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
         set(VLLM_EXT_SRC
@@ -1,53 +0,0 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels

set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")

# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
  FetchContent_Declare(
    triton_kernels
    SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
  )

else()
  set(TRITON_GIT "https://github.com/triton-lang/triton.git")
  message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
  FetchContent_Declare(
    triton_kernels
    # TODO (varun) : Fetch just the triton_kernels directory from Triton
    GIT_REPOSITORY https://github.com/triton-lang/triton.git
    GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
    GIT_PROGRESS TRUE
    SOURCE_SUBDIR python/triton_kernels/triton_kernels
  )
endif()

# Fetch content
FetchContent_MakeAvailable(triton_kernels)

if (NOT triton_kernels_SOURCE_DIR)
  message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
endif()

if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
else()
  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
endif()

message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")

add_custom_target(triton_kernels)

# Ensure the vllm/third_party directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")

## Copy .py files to install directory.
install(DIRECTORY
  ${TRITON_KERNELS_PYTHON_DIR}
  DESTINATION
  vllm/third_party/triton_kernels/
  COMPONENT triton_kernels
  FILES_MATCHING PATTERN "*.py")
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
+    GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -140,21 +140,16 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
   run_python(_VLLM_TORCH_GOMP_PATH
     "
 import os, glob
-import torch
-torch_pkg = os.path.dirname(torch.__file__)
-site_root = os.path.dirname(torch_pkg)
-
-# Search both torch.libs and torch/lib
-roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
-candidates = []
-for root in roots:
-    if not os.path.isdir(root):
-        continue
-    candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
-
-print(candidates[0] if candidates else '')
+try:
+    import torch
+    torch_pkg = os.path.dirname(torch.__file__)
+    site_root = os.path.dirname(torch_pkg)
+    torch_libs = os.path.join(site_root, 'torch.libs')
+    print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except:
+    print('')
 "
-    "failed to probe for libgomp")
+    "failed to probe torch.libs for libgomp")
 
   if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
     return()
@@ -500,13 +495,7 @@ function (define_extension_target MOD_NAME)
     set(SOABI_KEYWORD "")
   endif()
 
-  run_python(IS_FREETHREADED_PYTHON
-    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
-    "Failed to determine whether interpreter is free-threaded")
-
-  # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
-  # so avoid using the stable ABI under free-threading only.
-  if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
+  if (ARG_USE_SABI)
     Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
   else()
     Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
@@ -16,8 +16,7 @@ __global__ void merge_attn_states_kernel(
     scalar_t* output, float* output_lse, const scalar_t* prefix_output,
     const float* prefix_lse, const scalar_t* suffix_output,
     const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size, const uint prefix_head_stride,
-    const uint output_head_stride) {
+    const uint head_size) {
   using pack_128b_t = uint4;
   const uint pack_size = 16 / sizeof(scalar_t);
   const uint threads_per_head = head_size / pack_size;
@@ -35,13 +34,11 @@ __global__ void merge_attn_states_kernel(
   const uint head_idx = token_head_idx % num_heads;
 
   const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
-                               head_idx * prefix_head_stride;
-  const uint dst_head_offset = token_idx * num_heads * output_head_stride +
-                               head_idx * output_head_stride;
-  const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
-  scalar_t* output_head_ptr = output + dst_head_offset;
+  const uint head_offset =
+      token_idx * num_heads * head_size + head_idx * head_size;
+  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
+  scalar_t* output_head_ptr = output + head_offset;
 
   float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
   float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -143,7 +140,7 @@ __global__ void merge_attn_states_kernel(
         reinterpret_cast<float*>(prefix_lse.data_ptr()),                \
         reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),          \
         reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,    \
-        num_heads, head_size, prefix_head_stride, output_head_stride);  \
+        num_heads, head_size);                                           \
   }
 
 /*@brief Merges the attention states from prefix and suffix
@@ -169,11 +166,17 @@ void merge_attn_states_launcher(torch::Tensor& output,
   const uint num_tokens = output.size(0);
   const uint num_heads = output.size(1);
   const uint head_size = output.size(2);
-  const uint prefix_head_stride = prefix_output.stride(1);
-  const uint output_head_stride = output.stride(1);
   const uint pack_size = 16 / sizeof(scalar_t);
   TORCH_CHECK(head_size % pack_size == 0,
               "headsize must be multiple of pack_size:", pack_size);
+  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
+              "output heads must be contiguous in memory");
+  TORCH_CHECK(
+      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
+      "prefix_output heads must be contiguous in memory");
+  TORCH_CHECK(
+      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
+      "suffix_output heads must be contiguous in memory");
   float* output_lse_ptr = nullptr;
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();
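For context on the kernel above: merging prefix and suffix attention states is the standard log-sum-exp rescaling used when attention is computed in two chunks. The sketch below is an illustrative pure-PyTorch restatement of that formula (my own reference, not the CUDA kernel; shapes follow the comments in the code above, with LSE tensors laid out as [heads, tokens]):

# Hedged reference for merge_attn_states: combine two partial attention outputs
# using their log-sum-exp values so the result equals attention over both chunks.
import torch

def merge_attn_states_ref(p_out, p_lse, s_out, s_lse):
    # p_out, s_out: [tokens, heads, head_size]; p_lse, s_lse: [heads, tokens]
    max_lse = torch.maximum(p_lse, s_lse)
    p = torch.exp(p_lse - max_lse)
    s = torch.exp(s_lse - max_lse)
    denom = p + s
    # broadcast per-(head, token) weights over head_size: -> [tokens, heads, 1]
    p_w = (p / denom).transpose(0, 1).unsqueeze(-1)
    s_w = (s / denom).transpose(0, 1).unsqueeze(-1)
    out = p_out * p_w + s_out * s_w
    out_lse = max_lse + torch.log(denom)
    return out, out_lse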
21  csrc/cache.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <torch/all.h>
-#include <c10/util/Optional.h>
 
 #include <map>
 #include <vector>
@@ -42,12 +41,11 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);
 
 void gather_and_maybe_dequant_cache(
     torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
     torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
-    int64_t num_tokens, const std::string& kv_cache_dtype,
+    int64_t batch_size, const std::string& kv_cache_dtype,
     torch::Tensor const& scale,
     std::optional<torch::Tensor> seq_starts = std::nullopt);
 
@@ -59,15 +57,6 @@ void cp_gather_cache(
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
     int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
 
-// Gather and upconvert FP8 KV cache to BF16 workspace
-void cp_gather_and_upconvert_fp8_kv_cache(
-    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
-    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& seq_lens,          // [BATCH]
-    torch::Tensor const& workspace_starts,  // [BATCH]
-    int64_t batch_size);
-
 // Indexer K quantization and cache function
 void indexer_k_quant_and_cache(
     torch::Tensor& k,  // [num_tokens, head_dim]
@@ -2,7 +2,6 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <c10/cuda/CUDAException.h>
|
#include <c10/cuda/CUDAException.h>
|
||||||
#include <c10/util/Optional.h>
|
|
||||||
|
|
||||||
#include "cuda_utils.h"
|
#include "cuda_utils.h"
|
||||||
#include "cuda_compat.h"
|
#include "cuda_compat.h"
|
||||||
@@ -515,8 +514,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
const int quant_block_size, // quantization block size
|
const int quant_block_size, // quantization block size
|
||||||
const int cache_block_size, // cache block size
|
const int cache_block_size, // cache block size
|
||||||
const int cache_stride, // stride for each token in kv_cache
|
const int cache_stride, // stride for each token in kv_cache
|
||||||
|
const bool use_ue8m0 // use ue8m0 scale format
|
||||||
const bool use_ue8m0 // use ue8m0 scale format
|
|
||||||
) {
|
) {
|
||||||
constexpr int VEC_SIZE = 4;
|
constexpr int VEC_SIZE = 4;
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
@@ -554,11 +552,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
__syncwarp();
|
__syncwarp();
|
||||||
#endif
|
#endif
|
||||||
#if defined(__gfx942__)
|
|
||||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
|
||||||
#else
|
|
||||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||||
#endif
|
|
||||||
if (use_ue8m0) {
|
if (use_ue8m0) {
|
||||||
scale = exp2f(ceilf(log2f(scale)));
|
scale = exp2f(ceilf(log2f(scale)));
|
||||||
}
|
}
|
||||||
@@ -907,80 +901,87 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
|||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// grid is launched with dimensions (batch, num_splits)
|
// grid is launched with dimensions (batch, num_splits)
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
int ENTRY_SIZE, int CTA_SIZE>
|
|
||||||
__global__ void gather_and_maybe_dequant_cache(
|
__global__ void gather_and_maybe_dequant_cache(
|
||||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||||
// ENTRIES...]
|
// ENTRIES...]
|
||||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||||
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
|
const int32_t block_size, const int32_t entry_size,
|
||||||
const int32_t num_tokens, const int32_t block_size,
|
|
||||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
||||||
const float* __restrict__ scale,
|
const float* __restrict__ scale,
|
||||||
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
||||||
// batch
|
// batch
|
||||||
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
|
|
||||||
using ltype = vllm::vec_n_t<cache_t, vec_size>;
|
|
||||||
using stype = vllm::vec_n_t<scalar_t, vec_size>;
|
|
||||||
// We are adding this for code readability which will be optimized out when
|
|
||||||
// build in release.
|
|
||||||
assert(CTA_SIZE == blockDim.x);
|
|
||||||
|
|
||||||
#pragma unroll
|
const int64_t bid = blockIdx.x; // Batch ID
|
||||||
for (int token_id = blockIdx.x; token_id < num_tokens;
|
const int32_t num_splits = gridDim.y;
|
||||||
token_id += gridDim.x) {
|
const int32_t split = blockIdx.y;
|
||||||
int64_t batch_id = token_to_seq[token_id];
|
const int32_t seq_start = cu_seq_lens[bid];
|
||||||
int64_t batch_start = cu_seq_lens[batch_id];
|
const int32_t seq_end = cu_seq_lens[bid + 1];
|
||||||
int64_t batch_end = cu_seq_lens[batch_id + 1];
|
const int32_t seq_len = seq_end - seq_start;
|
||||||
int32_t batch_offset = token_id - batch_start;
|
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
|
||||||
|
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
|
||||||
|
|
||||||
if (token_id >= batch_end) return;
|
const int32_t split_start = split * split_blocks;
|
||||||
int32_t offset = 0;
|
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
|
||||||
if (seq_starts != nullptr) {
|
|
||||||
offset = seq_starts[batch_id];
|
|
||||||
}
|
|
||||||
batch_offset += offset;
|
|
||||||
int32_t block_table_id = batch_offset / block_size;
|
|
||||||
int32_t slot_id = batch_offset % block_size;
|
|
||||||
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
|
|
||||||
int32_t block_id = block_table[block_table_offset];
|
|
||||||
int64_t cache_offset =
|
|
||||||
block_id * cache_block_stride + slot_id * cache_entry_stride;
|
|
||||||
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
|
|
||||||
scalar_t* dst_ = dst + token_id * dst_entry_stride;
|
|
||||||
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
|
|
||||||
|
|
||||||
#pragma unroll
|
const bool is_active_split = (split_start < tot_blocks);
|
||||||
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
|
const bool is_last_split = (split_end == tot_blocks);
|
||||||
|
|
||||||
|
if (!is_active_split) return;
|
||||||
|
|
||||||
|
int32_t full_blocks_end = split_end;
|
||||||
|
int32_t partial_block_size = 0;
|
||||||
|
|
||||||
|
// Adjust the pointer for the block_table for this batch.
|
||||||
|
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
|
||||||
|
// page_size)
|
||||||
|
const int32_t batch_offset = bid * block_table_stride;
|
||||||
|
int32_t offset = 0;
|
||||||
|
if (seq_starts != nullptr) {
|
||||||
|
offset = seq_starts[bid] / block_size;
|
||||||
|
}
|
||||||
|
const int32_t* batch_block_table = block_table + batch_offset + offset;
|
||||||
|
|
||||||
|
// Adjust dst pointer based on the cumulative sequence lengths.
|
||||||
|
dst += seq_start * dst_entry_stride;
|
||||||
|
|
||||||
|
if (is_last_split) {
|
||||||
|
partial_block_size = seq_len % block_size;
|
||||||
|
if (partial_block_size) full_blocks_end -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto copy_entry = [&](const cache_t* __restrict__ _src,
|
||||||
|
scalar_t* __restrict__ _dst) {
|
||||||
|
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
reinterpret_cast<stype*>(dst_)[idx] =
|
_dst[i] = static_cast<scalar_t>(_src[i]);
|
||||||
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
|
|
||||||
} else {
|
} else {
|
||||||
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
|
_dst[i] =
|
||||||
stype store_val;
|
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < vec_size; ++j) {
|
|
||||||
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
|
|
||||||
loaded_val.val[j], *scale);
|
|
||||||
}
|
|
||||||
reinterpret_cast<stype*>(dst_)[idx] = store_val;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// process tail
|
};
|
||||||
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
|
|
||||||
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
|
for (int pid = split_start; pid < full_blocks_end; ++pid) {
|
||||||
src_ = src_ + ENTRY_SIZE - tail_cnt;
|
auto block_id = batch_block_table[pid];
|
||||||
#pragma unroll
|
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||||
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
|
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
for (int eid = 0; eid < block_size; ++eid) {
|
||||||
dst_[idx] = static_cast<scalar_t>(src_[idx]);
|
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||||
} else {
|
block_dst_ptr + eid * dst_entry_stride);
|
||||||
dst_[idx] =
|
}
|
||||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
|
}
|
||||||
}
|
|
||||||
|
if (partial_block_size) {
|
||||||
|
auto block_id = batch_block_table[full_blocks_end];
|
||||||
|
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||||
|
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
|
||||||
|
for (int eid = 0; eid < partial_block_size; ++eid) {
|
||||||
|
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||||
|
block_dst_ptr + eid * dst_entry_stride);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -991,38 +992,34 @@ __global__ void gather_and_maybe_dequant_cache(
 // SCALAR_T is the data type of the destination tensor.
 // CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                        \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576,      \
-                                       thread_block_size>                     \
-      <<<grid, block, 0, stream>>>(                                           \
-          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
-          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
-          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
-          token_to_seq.data_ptr<int32_t>(), num_tokens, block_size,           \
-          block_table_stride, cache_block_stride, cache_entry_stride,         \
-          dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
-          seq_starts_ptr);
+  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>           \
+      <<<grid, block, 0, stream>>>(                                           \
+          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
+          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
+          block_size, entry_size, block_table_stride, cache_block_stride,     \
+          cache_entry_stride, dst_entry_stride,                               \
+          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);

 // Gather sequences from the cache into the destination tensor.
 // - cu_seq_lens contains the cumulative sequence lengths for each batch
 // - block_table contains the cache block indices for each sequence
-// - token_to_seq contains the back mapping from token_id to batch_id
 // - Optionally, seq_starts (if provided) offsets the starting block index by
 //   (seq_starts[bid] / page_size)
 void gather_and_maybe_dequant_cache(
     torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
     torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
-    int64_t num_tokens, const std::string& kv_cache_dtype,
+    int64_t batch_size, const std::string& kv_cache_dtype,
     torch::Tensor const& scale,
     std::optional<torch::Tensor> seq_starts = std::nullopt) {
   at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   int32_t block_size = src_cache.size(1);
-  int32_t head_dim = dst.size(-1);
+  int32_t entry_size = src_cache.flatten(2, -1).size(2);

   TORCH_CHECK(block_table.dtype() == torch::kInt32,
               "block_table must be int32");
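The comments above describe how the wrapper's tensors are interpreted: `cu_seq_lens` gives each sequence's row range in `dst`, `block_table` lists the cache blocks belonging to each sequence, and `seq_starts`, if present, shifts the starting block index by `seq_starts[bid] / page_size`. The following is a rough host-side C++ sketch of that addressing under that reading; it is not vLLM's implementation, `page_size` is taken to equal the cache block size, and all names besides those quoted from the diff are illustrative.

```cpp
#include <cstdint>
#include <utility>

// Illustrative sketch: map a destination row `tok` (0 <= tok < cu_seq_lens[batch_size])
// back to a (cache block id, entry within block) pair, following the comments in the
// diff: cu_seq_lens gives per-sequence row ranges, block_table holds the block ids of
// each sequence, and seq_starts (optional) shifts the starting block index by whole
// blocks, i.e. by seq_starts[bid] / block_size.
std::pair<int32_t, int32_t> locate_cache_entry(
    int64_t tok, const int32_t* cu_seq_lens, int64_t batch_size,
    const int32_t* block_table, int64_t block_table_stride, int32_t block_size,
    const int32_t* seq_starts /* may be nullptr */) {
  // Find the sequence this row belongs to (linear scan for clarity).
  int64_t bid = 0;
  while (bid + 1 < batch_size && tok >= cu_seq_lens[bid + 1]) ++bid;

  const int64_t local = tok - cu_seq_lens[bid];       // row within the sequence
  const int64_t block_offset =
      seq_starts ? seq_starts[bid] / block_size : 0;  // whole-block shift
  const int32_t block_id =
      block_table[bid * block_table_stride + block_offset + local / block_size];
  const int32_t entry_in_block = static_cast<int32_t>(local % block_size);
  return {block_id, entry_in_block};
}
```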
@@ -1032,9 +1029,6 @@ void gather_and_maybe_dequant_cache(
     TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                 "seq_starts must be int32");
   }
-  TORCH_CHECK(head_dim == 576,
-              "gather_and_maybe_dequant_cache only support the head_dim to 576 "
-              "for better performance")

   TORCH_CHECK(src_cache.device() == dst.device(),
               "src_cache and dst must be on the same device");
@@ -1052,9 +1046,10 @@ void gather_and_maybe_dequant_cache(
   int64_t cache_entry_stride = src_cache.stride(1);
   int64_t dst_entry_stride = dst.stride(0);

-  constexpr int32_t thread_block_size = 64;
-  dim3 grid(num_tokens);
-  dim3 block(thread_block_size);
+  // Decide on the number of splits based on the batch size.
+  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+  dim3 grid(batch_size, num_splits);
+  dim3 block(1024);

   const int32_t* seq_starts_ptr =
       seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
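The replacement launch configuration splits each sequence over `num_splits` thread blocks, using fewer splits when the batch itself is large. Here is a small host-side C++ sketch of that heuristic and of the token range a given (batch, split) block would cover, mirroring the `ceil_div` arithmetic visible in the removed kernel further down; the function names are illustrative.

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>

// Same batch-size heuristic as in the diff: large batches already expose enough
// parallelism across sequences, so each sequence gets fewer splits.
inline int pick_num_splits(int64_t batch_size) {
  return batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
}

// Token range [start, end) handled by a given (split, num_splits) pair for a
// sequence of `seq_len` tokens; an empty range means the split is inactive.
inline std::pair<int32_t, int32_t> split_range(int32_t seq_len, int32_t split,
                                               int32_t num_splits) {
  const int32_t split_slots = (seq_len + num_splits - 1) / num_splits;  // ceil_div
  const int32_t start = split * split_slots;
  const int32_t end = std::min((split + 1) * split_slots, seq_len);
  return {start, std::max(start, end)};
}
```

Fewer splits per sequence are presumably acceptable at large batch sizes because the batch dimension of the grid already supplies enough thread blocks to keep the GPU busy.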
@@ -1063,82 +1058,6 @@ void gather_and_maybe_dequant_cache(
 }

 namespace vllm {
-
-// Gather and upconvert FP8 KV cache tokens to BF16 workspace
-// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
-__global__ void cp_gather_and_upconvert_fp8_kv_cache(
-    const uint8_t* __restrict__ src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    __nv_bfloat16* __restrict__ dst,               // [TOT_TOKENS, 576]
-    const int32_t* __restrict__ block_table,       // [BATCH, BLOCK_INDICES]
-    const int32_t* __restrict__ seq_lens,          // [BATCH]
-    const int32_t* __restrict__ workspace_starts,  // [BATCH]
-    const int32_t block_size, const int32_t head_dim,
-    const int64_t block_table_stride, const int64_t cache_block_stride,
-    const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
-  const int64_t bid = blockIdx.x;  // Batch ID
-  const int32_t num_splits = gridDim.y;
-  const int32_t split = blockIdx.y;
-  const int32_t seq_start = workspace_starts[bid];
-  const int32_t seq_len = seq_lens[bid];
-  const int32_t tot_slots = seq_len;
-  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
-
-  const int32_t split_start = split * split_slots;
-  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
-
-  const bool is_active_split = (split_start < tot_slots);
-
-  if (!is_active_split) return;
-
-  // Adjust the pointer for the block_table for this batch
-  const int32_t batch_offset = bid * block_table_stride;
-  int32_t offset = split_start;
-  int32_t offset_div = offset / block_size;
-  offset = offset % block_size;
-  const int32_t* batch_block_table = block_table + batch_offset;
-
-  // Adjust dst pointer based on the cumulative sequence lengths
-  dst += seq_start * dst_entry_stride;
-
-  const int tid = threadIdx.x;
-
-  // Process each token in this split
-  for (int pid = split_start; pid < split_end; ++pid) {
-    auto block_id = batch_block_table[offset_div];
-    const uint8_t* token_ptr =
-        src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
-    __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
-
-    // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
-    const uint8_t* no_pe_ptr = token_ptr;
-    const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
-    const __nv_bfloat16* rope_ptr =
-        reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
-
-    // Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
-    if (tid < 512) {
-      // FP8 dequantization
-      const int tile = tid >> 7;  // each tile is 128 elements
-      const float scale = scales_ptr[tile];
-      const uint8_t val = no_pe_ptr[tid];
-      dst_ptr[tid] =
-          fp8::scaled_convert<__nv_bfloat16, uint8_t,
-                              vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
-    } else if (tid < 576) {
-      // Rope copy (64 bf16 elements)
-      const int rope_idx = tid - 512;
-      dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
-    }
-
-    // Move to next token
-    offset += 1;
-    if (offset == block_size) {
-      offset_div += 1;
-      offset = 0;
-    }
-  }
-}
-
 template <typename scalar_t>
 // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
 // block_size.
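The kernel removed above assumes a fixed 656-byte cache entry per token: 512 one-byte FP8 values, then four float32 scales (16 bytes, one scale per 128-element tile), then 64 bf16 RoPE values (128 bytes). The following C++ sketch captures just that layout arithmetic; the FP8 numeric conversion itself is delegated to `fp8::scaled_convert` in the real kernel and is not reproduced here, and the bf16 values are exposed only as raw 16-bit words.

```cpp
#include <cstdint>

// Byte layout of one 656-byte MLA cache entry as read by the removed kernel.
struct Fp8MlaEntryView {
  const uint8_t* fp8_vals;    // 512 FP8 (e4m3) bytes, output elements 0..511
  const float* scales;        // 4 float32 scales, one per 128-element tile
  const uint16_t* rope_bf16;  // 64 bf16 values (raw bits), output elements 512..575
};

inline Fp8MlaEntryView view_entry(const uint8_t* token_ptr) {
  Fp8MlaEntryView v;
  v.fp8_vals = token_ptr;                                                 // offset 0
  v.scales = reinterpret_cast<const float*>(token_ptr + 512);             // offset 512
  v.rope_bf16 = reinterpret_cast<const uint16_t*>(token_ptr + 512 + 16);  // offset 528
  return v;
}

// Which of the four scales applies to output element `i` (0..511):
// one scale per 128-element tile, matching `tile = tid >> 7` in the kernel.
inline int scale_tile(int i) { return i >> 7; }
```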
@@ -1280,57 +1199,6 @@ void cp_gather_cache(
   }
 }

-void cp_gather_and_upconvert_fp8_kv_cache(
-    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
-    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& seq_lens,          // [BATCH]
-    torch::Tensor const& workspace_starts,  // [BATCH]
-    int64_t batch_size) {
-  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int32_t block_size = src_cache.size(1);
-  int32_t head_dim = dst.size(1);
-
-  TORCH_CHECK(block_table.dtype() == torch::kInt32,
-              "block_table must be int32");
-  TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
-  TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
-              "workspace_starts must be int32");
-
-  TORCH_CHECK(src_cache.device() == dst.device(),
-              "src_cache and dst must be on the same device");
-  TORCH_CHECK(src_cache.device() == block_table.device(),
-              "src_cache and block_table must be on the same device");
-  TORCH_CHECK(src_cache.device() == seq_lens.device(),
-              "src_cache and seq_lens must be on the same device");
-  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
-              "src_cache and workspace_starts must be on the same device");
-
-  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
-  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
-  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
-
-  int64_t block_table_stride = block_table.stride(0);
-  int64_t cache_block_stride = src_cache.stride(0);
-  int64_t cache_entry_stride = src_cache.stride(1);
-  int64_t dst_entry_stride = dst.stride(0);
-
-  // Decide on the number of splits based on the batch size
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
-  dim3 grid(batch_size, num_splits);
-  dim3 block(576);
-
-  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
-      src_cache.data_ptr<uint8_t>(),
-      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
-      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
-      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
-      block_table_stride, cache_block_stride, cache_entry_stride,
-      dst_entry_stride);
-}
-
 // Macro to dispatch the kernel based on the data type.
 #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
   vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
Some files were not shown because too many files have changed in this diff.