Compare commits
12 Commits
v0.14.0rc0
...
v0.12.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4fd9d6a85c | ||
|
|
a1d627e40f | ||
|
|
2f055ec1c1 | ||
|
|
6a6108511f | ||
|
|
9057fc2f1b | ||
|
|
a05b580540 | ||
|
|
b6ae5aeca6 | ||
|
|
5c7c09af8f | ||
|
|
7f718169d1 | ||
|
|
339e84ce86 | ||
|
|
34a8559be7 | ||
|
|
85fb2e3120 |
@@ -1,24 +0,0 @@
|
||||
name: vllm_ci
|
||||
job_dirs:
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/image_build"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
- "requirements/common.txt"
|
||||
- "requirements/cuda.txt"
|
||||
- "requirements/build.txt"
|
||||
- "requirements/test.txt"
|
||||
- "setup.py"
|
||||
- "csrc/"
|
||||
- "cmake/"
|
||||
run_all_exclude_patterns:
|
||||
- "docker/Dockerfile."
|
||||
- "csrc/cpu/"
|
||||
- "csrc/rocm/"
|
||||
- "cmake/hipify.py"
|
||||
- "cmake/cpu_extension.cmake"
|
||||
registries: public.ecr.aws/q9t5s3a7
|
||||
repositories:
|
||||
main: "vllm-ci-postmerge-repo"
|
||||
premerge: "vllm-ci-test-repo"
|
||||
46
.buildkite/generate_index.py
Normal file
46
.buildkite/generate_index.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
template = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<h1>Links for vLLM</h1/>
|
||||
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
|
||||
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--wheel", help="The wheel path.", required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
filename = os.path.basename(args.wheel)
|
||||
|
||||
with open("index.html", "w") as f:
|
||||
print(f"Generated index.html for {args.wheel}")
|
||||
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
|
||||
if "x86_64" in filename:
|
||||
x86_wheel = filename
|
||||
arm_wheel = filename.replace("x86_64", "aarch64").replace(
|
||||
"manylinux1", "manylinux2014"
|
||||
)
|
||||
elif "aarch64" in filename:
|
||||
x86_wheel = filename.replace("aarch64", "x86_64").replace(
|
||||
"manylinux2014", "manylinux1"
|
||||
)
|
||||
arm_wheel = filename
|
||||
else:
|
||||
raise ValueError(f"Unsupported wheel: {filename}")
|
||||
# cloudfront requires escaping the '+' character
|
||||
f.write(
|
||||
template.format(
|
||||
x86_wheel=x86_wheel,
|
||||
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
|
||||
arm_wheel=arm_wheel,
|
||||
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
|
||||
)
|
||||
)
|
||||
@@ -1,56 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 8 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
BRANCH=$4
|
||||
VLLM_USE_PRECOMPILED=$5
|
||||
VLLM_MERGE_BASE_COMMIT=$6
|
||||
CACHE_FROM=$7
|
||||
CACHE_TO=$8
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# docker buildx
|
||||
docker buildx create --name vllm-builder --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
docker buildx ls
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
||||
else
|
||||
merge_base_commit_build_args=""
|
||||
fi
|
||||
|
||||
# build
|
||||
docker buildx build --file docker/Dockerfile \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
||||
${merge_base_commit_build_args} \
|
||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
||||
--push \
|
||||
--target test \
|
||||
--progress plain .
|
||||
@@ -1,57 +0,0 @@
|
||||
group: Abuild
|
||||
steps:
|
||||
- label: ":docker: Build image"
|
||||
key: image-build
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build CPU image"
|
||||
key: image-build-cpu
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build HPU image"
|
||||
soft_fail: true
|
||||
depends_on: []
|
||||
key: image-build-hpu
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build CPU arm64 image"
|
||||
key: cpu-arm64-image-build
|
||||
depends_on: []
|
||||
optional: true
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
@@ -1,36 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--build-arg VLLM_CPU_AMXBF16=true \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
@@ -1,33 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build \
|
||||
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
|
||||
--progress plain \
|
||||
https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
|
||||
@@ -8,4 +8,3 @@ tasks:
|
||||
value: 0.80
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
rtol: 0.05
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@@ -9,40 +9,11 @@ pytest -s -v test_lm_eval_correctness.py \
|
||||
--tp-size=1
|
||||
"""
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@contextmanager
|
||||
def scoped_env_vars(new_env: dict[str, str]):
|
||||
if not new_env:
|
||||
# Fast path: nothing to do
|
||||
yield
|
||||
return
|
||||
|
||||
old_values = {}
|
||||
new_keys = []
|
||||
|
||||
try:
|
||||
for key, value in new_env.items():
|
||||
if key in os.environ:
|
||||
old_values[key] = os.environ[key]
|
||||
else:
|
||||
new_keys.append(key)
|
||||
os.environ[key] = str(value)
|
||||
yield
|
||||
finally:
|
||||
# Restore / clean up
|
||||
for key, value in old_values.items():
|
||||
os.environ[key] = value
|
||||
for key in new_keys:
|
||||
os.environ.pop(key, None)
|
||||
RTOL = 0.08
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
@@ -61,26 +32,23 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len},"
|
||||
)
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
@@ -89,8 +57,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
@@ -98,9 +64,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
|
||||
assert success
|
||||
|
||||
@@ -15,21 +15,6 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
@@ -40,7 +25,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@@ -54,7 +39,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@@ -67,21 +52,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
|
||||
@@ -7,21 +7,18 @@
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
<html>
|
||||
<!-- {comment} -->
|
||||
<meta name="pypi:repository-version" content="1.0">
|
||||
<body>
|
||||
{items}
|
||||
@@ -92,7 +89,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
)
|
||||
|
||||
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
def generate_project_list(subdir_names: list[str]) -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
"""
|
||||
@@ -100,14 +97,11 @@ def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
for name in sorted(subdir_names):
|
||||
name = name.strip("/").strip(".")
|
||||
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
|
||||
|
||||
def generate_package_index_and_metadata(
|
||||
wheel_files: list[WheelFileInfo],
|
||||
wheel_base_dir: Path,
|
||||
index_base_dir: Path,
|
||||
comment: str = "",
|
||||
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Generate package index HTML content for a specific package, linking to actual wheel files.
|
||||
@@ -125,7 +119,7 @@ def generate_package_index_and_metadata(
|
||||
file_meta = asdict(file)
|
||||
file_meta["path"] = file_path_quoted
|
||||
metadata.append(file_meta)
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
metadata_str = json.dumps(metadata, indent=2)
|
||||
return index_str, metadata_str
|
||||
|
||||
@@ -136,7 +130,6 @@ def generate_index_and_metadata(
|
||||
index_base_dir: Path,
|
||||
default_variant: str | None = None,
|
||||
alias_to_default: str | None = None,
|
||||
comment: str = "",
|
||||
):
|
||||
"""
|
||||
Generate index for all wheel files.
|
||||
@@ -147,7 +140,6 @@ def generate_index_and_metadata(
|
||||
index_base_dir (Path): Base directory to store index files.
|
||||
default_variant (str | None): The default variant name, if any.
|
||||
alias_to_default (str | None): Alias variant name for the default variant, if any.
|
||||
comment (str | None): Optional comment to include in the generated HTML files.
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
@@ -241,10 +233,6 @@ def generate_index_and_metadata(
|
||||
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
|
||||
print(f"Alias variant '{alias_to_default}' created for default variant.")
|
||||
|
||||
# Generate comment in HTML header
|
||||
comment_str = f" ({comment})" if comment else ""
|
||||
comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
|
||||
|
||||
# Generate index for each variant
|
||||
subdir_names = set()
|
||||
for variant, files in variant_to_files.items():
|
||||
@@ -264,7 +252,7 @@ def generate_index_and_metadata(
|
||||
subdir_names = subdir_names.union(packages)
|
||||
else:
|
||||
# generate project list for this variant directly
|
||||
project_list_str = generate_project_list(sorted(packages), comment_tmpl)
|
||||
project_list_str = generate_project_list(sorted(packages))
|
||||
with open(variant_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
@@ -274,7 +262,7 @@ def generate_index_and_metadata(
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
package_files, wheel_base_dir, package_dir, comment
|
||||
package_files, wheel_base_dir, package_dir
|
||||
)
|
||||
with open(package_dir / "index.html", "w") as f:
|
||||
f.write(index_str)
|
||||
@@ -282,7 +270,7 @@ def generate_index_and_metadata(
|
||||
f.write(metadata_str)
|
||||
|
||||
# Generate top-level project list index
|
||||
project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
|
||||
project_list_str = generate_project_list(sorted(subdir_names))
|
||||
with open(index_base_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
@@ -294,7 +282,6 @@ if __name__ == "__main__":
|
||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
||||
--output-dir <output_directory> : directory to store generated index files
|
||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
||||
--comment <comment_string> : (optional) comment string to include in generated HTML files
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
@@ -324,12 +311,6 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Alias variant name for the default variant",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--comment",
|
||||
type=str,
|
||||
default="",
|
||||
help="Optional comment string to include in generated HTML files",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -372,17 +353,6 @@ if __name__ == "__main__":
|
||||
|
||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
||||
|
||||
# keep only "official" files for a non-nightly version (specifed by cli args)
|
||||
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
||||
if PY_VERSION_RE.match(version):
|
||||
# upload-wheels.sh ensures no "dev" is in args.version
|
||||
wheel_files = list(
|
||||
filter(lambda x: version in x and "dev" not in x, wheel_files)
|
||||
)
|
||||
print(f"Non-nightly version detected, wheel files used: {wheel_files}")
|
||||
else:
|
||||
print("Nightly version detected, keeping all wheel files.")
|
||||
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{version}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
@@ -395,6 +365,5 @@ if __name__ == "__main__":
|
||||
index_base_dir=index_base_dir,
|
||||
default_variant=None,
|
||||
alias_to_default=args.alias_to_default,
|
||||
comment=args.comment.strip(),
|
||||
)
|
||||
print(f"Successfully generated index and metadata in {output_dir}")
|
||||
|
||||
@@ -141,6 +141,7 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_sleep.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
|
||||
@@ -36,17 +36,11 @@ function cpu_tests() {
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run model tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/test_onednn.py
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
|
||||
|
||||
# basic online serving
|
||||
docker exec cpu-test bash -c '
|
||||
|
||||
@@ -50,7 +50,6 @@ function cpu_tests() {
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
# Run basic model test
|
||||
|
||||
@@ -74,7 +74,6 @@ FROM ${BASE_IMAGE_NAME}
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION="ascend910b1"
|
||||
|
||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
||||
|
||||
@@ -38,8 +38,7 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
cd tests
|
||||
pytest -v -s v1/core
|
||||
pytest -v -s v1/engine
|
||||
@@ -47,6 +46,6 @@ docker run \
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@@ -12,11 +12,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
||||
|
||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Setting up Prime-RL integration test environment..."
|
||||
|
||||
# Clean up any existing Prime-RL directory
|
||||
|
||||
@@ -44,10 +44,10 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--enable-eplb \
|
||||
--all2all-backend $BACK \
|
||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8040}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 600 bash -c '
|
||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||
sleep 1
|
||||
done'
|
||||
}
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
for _ in {1..20}; do
|
||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--all2all-backend $BACK \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
||||
PY
|
||||
|
||||
cleanup
|
||||
SERVER_PID=
|
||||
sleep 1
|
||||
PORT=$((PORT+1))
|
||||
done
|
||||
@@ -34,10 +34,9 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
fi
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# we also accept params as manylinux tag
|
||||
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
|
||||
manylinux_version="${1:-manylinux_2_31}"
|
||||
manylinux_version="manylinux_2_31"
|
||||
|
||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
||||
if [[ "$wheel" != *"linux"* ]]; then
|
||||
@@ -82,10 +81,7 @@ else
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
@@ -97,11 +93,8 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
|
||||
fi
|
||||
|
||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
# copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
if [[ "$version" != *"dev"* ]]; then
|
||||
echo "Re-generating indices for /$pure_version/"
|
||||
rm -rf "$INDICES_OUTPUT_DIR/*"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
||||
echo "Uploading indices to overwrite /$pure_version/"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||
fi
|
||||
|
||||
@@ -61,8 +61,8 @@ steps:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
|
||||
timeout_in_minutes: 30
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
grade: Blocking
|
||||
@@ -73,7 +73,6 @@ steps:
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
@@ -83,7 +82,6 @@ steps:
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
@@ -128,7 +126,7 @@ steps:
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration Test (LLM) # 30min
|
||||
timeout_in_minutes: 40
|
||||
@@ -148,7 +146,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 1) # 100min
|
||||
- label: Entrypoints Integration Test (API Server) # 100min
|
||||
timeout_in_minutes: 130
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
@@ -162,28 +160,10 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 2)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/sleep
|
||||
- tests/entrypoints/rpc
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/sleep
|
||||
- pytest -v -s tool_use
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
|
||||
- label: Entrypoints Integration Test (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -346,10 +326,10 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 Test e2e + engine # 65min
|
||||
timeout_in_minutes: 90
|
||||
- label: V1 Test e2e + engine # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -418,8 +398,7 @@ steps:
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- vllm/
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@@ -455,34 +434,29 @@ steps:
|
||||
|
||||
- label: Examples Test # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@@ -740,21 +714,19 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.14.1
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: LM Eval Small Models # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness # 10min
|
||||
timeout_in_minutes: 15
|
||||
@@ -765,10 +737,33 @@ steps:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval+Transcription WER check
|
||||
commands: # LMEval
|
||||
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 23 min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
fast_check: false
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
@@ -939,18 +934,6 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor Test # 44min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -978,8 +961,8 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
|
||||
timeout_in_minutes: 180
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@@ -991,8 +974,7 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@@ -1016,8 +998,7 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3 # 75min
|
||||
timeout_in_minutes: 150
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@@ -1126,6 +1107,7 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- vllm/model_executor/layers/fused_moe/layer.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
@@ -1159,15 +1141,17 @@ steps:
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@@ -1176,7 +1160,7 @@ steps:
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
@@ -1203,7 +1187,7 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@@ -1381,7 +1365,7 @@ steps:
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# doesn't support LoRA yet
|
||||
#- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
@@ -1447,13 +1431,12 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
@@ -1465,11 +1448,11 @@ steps:
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
@@ -1479,7 +1462,6 @@ steps:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -1490,14 +1472,14 @@ steps:
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
@@ -1511,57 +1493,6 @@ steps:
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### E2E Eval Tests #####
|
||||
- label: LM Eval Small Models (1 Card) # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
|
||||
- label: LM Eval Large Models (4 Card)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: ROCm LM Eval Large Models (8 Card)
|
||||
mirror_hardwares: [amdproduction]
|
||||
agent_pool: mi325_8
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -1576,6 +1507,7 @@ steps:
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
@@ -1607,27 +1539,4 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
@@ -57,8 +57,8 @@ steps:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
|
||||
timeout_in_minutes: 30
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
@@ -66,7 +66,6 @@ steps:
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
@@ -76,7 +75,6 @@ steps:
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
@@ -114,7 +112,7 @@ steps:
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration Test (LLM) # 30min
|
||||
timeout_in_minutes: 40
|
||||
@@ -132,7 +130,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 1) # 100min
|
||||
- label: Entrypoints Integration Test (API Server) # 100min
|
||||
timeout_in_minutes: 130
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -144,26 +142,10 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 2)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/sleep
|
||||
- tests/entrypoints/rpc
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/sleep
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s tool_use
|
||||
|
||||
- label: Entrypoints Integration Test (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -319,10 +301,7 @@ steps:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
# Run this test standalone for now;
|
||||
# need to untangle use (implicit) use of spawn/fork across the tests.
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
@@ -371,8 +350,7 @@ steps:
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- vllm/
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@@ -409,28 +387,23 @@ steps:
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@@ -489,9 +462,7 @@ steps:
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@@ -505,9 +476,7 @@ steps:
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
||||
|
||||
- label: PyTorch Fullgraph Test # 27min
|
||||
timeout_in_minutes: 40
|
||||
@@ -661,7 +630,7 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
@@ -673,7 +642,7 @@ steps:
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness # 22min
|
||||
timeout_in_minutes: 30
|
||||
@@ -685,6 +654,25 @@ steps:
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 23 min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Tests (Initialization)
|
||||
@@ -694,7 +682,6 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Run a subset of model initialization tests
|
||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||
@@ -707,7 +694,6 @@ steps:
|
||||
- vllm/model_executor/models/
|
||||
- vllm/transformers_utils/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
# subset of supported models (the complement of the small subset in the above
|
||||
@@ -840,7 +826,7 @@ steps:
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor Test
|
||||
@@ -1073,7 +1059,7 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@@ -1232,8 +1218,6 @@ steps:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# Alot of these tests are on the edge of OOMing
|
||||
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
@@ -1334,7 +1318,7 @@ steps:
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
@@ -1352,7 +1336,6 @@ steps:
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
@@ -1386,4 +1369,4 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
@@ -1,21 +0,0 @@
|
||||
group: Attention
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
@@ -1,16 +0,0 @@
|
||||
group: Basic Correctness
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Basic Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
@@ -1,19 +0,0 @@
|
||||
group: Benchmarks
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Benchmarks
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
@@ -1,57 +0,0 @@
|
||||
group: Compile
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
group: CUDA
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Platform Tests (CUDA)
|
||||
timeout_in_minutes: 15
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
|
||||
- label: Cudagraph
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- tests/v1/cudagraph
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/config/compilation.py
|
||||
- vllm/compilation
|
||||
commands:
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
|
||||
@@ -1,199 +0,0 @@
|
||||
group: Distributed
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
commands:
|
||||
- pytest -v -s distributed/test_comm_ops.py
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
- vllm/config/parallel.py
|
||||
- vllm/distributed/
|
||||
- vllm/v1/engine/llm_engine.py
|
||||
- vllm/v1/executor/uniproc_executor.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and dp=4 with ep
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
@@ -1,42 +0,0 @@
|
||||
group: E2E Integration
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: Prime-RL Integration (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
@@ -1,26 +0,0 @@
|
||||
group: Engine
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Engine
|
||||
timeout_in_minutes: 15
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 e2e + engine
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
@@ -1,83 +0,0 @@
|
||||
group: Entrypoints
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Entrypoints Unit Tests
|
||||
timeout_in_minutes: 10
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration (API Server 1)
|
||||
timeout_in_minutes: 130
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
|
||||
- label: Entrypoints Integration (API Server 2)
|
||||
timeout_in_minutes: 130
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
- tests/entrypoints/sleep
|
||||
- tests/entrypoints/instrumentator
|
||||
- tests/entrypoints/rpc
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s entrypoints/instrumentator
|
||||
- pytest -v -s entrypoints/sleep
|
||||
- pytest -v -s tool_use
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/pooling
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/pooling
|
||||
|
||||
|
||||
- label: Entrypoints V1
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
@@ -1,23 +0,0 @@
|
||||
group: Expert Parallelism
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: EPLB Algorithm
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_algo.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
@@ -1,117 +0,0 @@
|
||||
group: Kernels
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Kernels Core Operation Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
timeout_in_minutes: 90
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- csrc/moe/
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
- vllm/distributed/device_communicators/
|
||||
- vllm/envs.py
|
||||
- vllm/config
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
- vllm/model_executor/layers/mamba/ops
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization/test_block_fp8.py
|
||||
- tests/kernels/moe/test_deepgemm.py
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- csrc/attention/mla/
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||
- vllm/platforms/cuda.py
|
||||
- vllm/attention/selector.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
@@ -1,46 +0,0 @@
|
||||
group: LM Eval
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: LM Eval Small Models
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
@@ -1,33 +0,0 @@
|
||||
group: LoRA
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: LoRA %N
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# Alot of these tests are on the edge of OOMing
|
||||
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
@@ -1,165 +0,0 @@
|
||||
group: Miscellaneous
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 Others
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_outputs.py
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on: ~
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'cpu_test' v1/metrics
|
||||
|
||||
- label: Regression
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Examples
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 offline_inference/basic/chat.py # for basic
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
commands:
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s v1/tracing
|
||||
|
||||
- label: Python-only Installation
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
- setup.py
|
||||
commands:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
@@ -1,17 +0,0 @@
|
||||
group: Model Executor
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Model Executor
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- vllm/engine/arg_utils.py
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
@@ -1,64 +0,0 @@
|
||||
group: Models - Basic
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Basic Models Tests (Initialization)
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Run a subset of model initialization tests
|
||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||
|
||||
- label: Basic Models Tests (Extra Initialization) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
# subset of supported models (the complement of the small subset in the above
|
||||
# test.) Also run if model initialization test file is modified
|
||||
- pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Basic Models Tests (Other)
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
|
||||
- label: Transformers Nightly Models
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
@@ -1,22 +0,0 @@
|
||||
group: Models - Distributed
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Distributed Model Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
- tests/basic_correctness/
|
||||
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
||||
- tests/models/
|
||||
commands:
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||
@@ -1,91 +0,0 @@
|
||||
group: Models - Language
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Test standard language models, excluding a subset of slow tests
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m 'core_model and (not slow_test)'
|
||||
|
||||
- label: Language Models Tests (Extra Standard) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- tests/models/language/pooling/test_embedding.py
|
||||
- tests/models/language/generation/test_common.py
|
||||
- tests/models/language/pooling/test_classification.py
|
||||
commands:
|
||||
# Shard slow subset of standard language models tests. Only run when model
|
||||
# source is modified, or when specified test files are modified
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Language Models Tests (Hybrid) %N
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
# Shard hybrid language model tests
|
||||
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 80min
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
|
||||
- label: Language Models Test (PPL)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation_ppl_test
|
||||
commands:
|
||||
- pytest -v -s models/language/generation_ppl_test
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Language Models Test (MTEB)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling_mteb_test
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
@@ -1,79 +0,0 @@
|
||||
group: Models - Multimodal
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Multi-Modal Models (Standard) # 60min
|
||||
timeout_in_minutes: 80
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor # 44min
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models (Extended) 1
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||
|
||||
- label: Multi-Modal Models (Extended) 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended) 3
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
@@ -1,34 +0,0 @@
|
||||
group: Plugins
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Plugin Tests (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||
- pip uninstall dummy_stat_logger -y
|
||||
# end stat_logger plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
@@ -1,52 +0,0 @@
|
||||
group: PyTorch
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# Run unit tests defined directly under compile/,
|
||||
# not including subdirectories, which are usually heavier
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 40
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
# of the dependencies. Please check the error message and add the package to whitelist
|
||||
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- requirements/nightly_torch_test.txt
|
||||
commands:
|
||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
||||
@@ -1,46 +0,0 @@
|
||||
group: Quantization
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Quantization
|
||||
timeout_in_minutes: 90
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release, and pin a working version of torchao nightly here
|
||||
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/models/llama4.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
||||
- vllm/model_executor/layers/quantization/modelopt.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized Models Test
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- pytest -v -s models/quantization
|
||||
@@ -1,14 +0,0 @@
|
||||
group: Samplers
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Samplers Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
- tests/samplers
|
||||
- tests/conftest.py
|
||||
commands:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
@@ -1,25 +0,0 @@
|
||||
group: Weight Loading
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Weight Loading Multiple GPU # 33min
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
4
.github/CODEOWNERS
vendored
4
.github/CODEOWNERS
vendored
@@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
|
||||
/requirements/kv_connectors.txt @NickLucche
|
||||
|
||||
# Pooling models
|
||||
/examples/pooling @noooop
|
||||
/examples/*/pooling/ @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
/vllm/pooling_params.py @noooop
|
||||
/vllm/model_executor/layers/pooler.py @noooop
|
||||
|
||||
76
.github/mergify.yml
vendored
76
.github/mergify.yml
vendored
@@ -14,52 +14,6 @@ pull_request_rules:
|
||||
comment:
|
||||
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
|
||||
|
||||
- name: comment-pre-commit-failure
|
||||
description: Comment on PR when pre-commit check fails
|
||||
conditions:
|
||||
- status-failure=pre-commit
|
||||
- -closed
|
||||
- -draft
|
||||
actions:
|
||||
comment:
|
||||
message: |
|
||||
Hi @{{author}}, the pre-commit checks have failed. Please run:
|
||||
|
||||
```bash
|
||||
uv pip install pre-commit
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
Then, commit the changes and push to your branch.
|
||||
|
||||
For future commits, `pre-commit` will run automatically on changed files before each commit.
|
||||
|
||||
> [!TIP]
|
||||
> <details>
|
||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
||||
> <br/>
|
||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
||||
>
|
||||
> ```bash
|
||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||
> pre-commit run --hook-stage manual mypy-3.10
|
||||
> # For markdownlint
|
||||
> pre-commit run --hook-stage manual markdownlint
|
||||
> ```
|
||||
> </details>
|
||||
|
||||
- name: comment-dco-failure
|
||||
description: Comment on PR when DCO check fails
|
||||
conditions:
|
||||
- status-failure=dco
|
||||
- -closed
|
||||
- -draft
|
||||
actions:
|
||||
comment:
|
||||
message: |
|
||||
Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
|
||||
|
||||
- name: label-ci-build
|
||||
description: Automatically apply ci/build label
|
||||
conditions:
|
||||
@@ -186,7 +140,7 @@ pull_request_rules:
|
||||
- files~=^tests/entrypoints/test_context.py
|
||||
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/entrypoints/openai/parser/harmony_utils.py
|
||||
- files~=^vllm/entrypoints/harmony_utils.py
|
||||
- files~=^vllm/entrypoints/tool_server.py
|
||||
- files~=^vllm/entrypoints/tool.py
|
||||
- files~=^vllm/entrypoints/context.py
|
||||
@@ -235,20 +189,6 @@ pull_request_rules:
|
||||
add:
|
||||
- rocm
|
||||
|
||||
- name: label-cpu
|
||||
description: Automatically apply cpu label
|
||||
conditions:
|
||||
- label != stale
|
||||
- files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- cpu
|
||||
assign:
|
||||
users:
|
||||
- "fadara01"
|
||||
- "aditew01"
|
||||
|
||||
- name: label-structured-output
|
||||
description: Automatically apply structured-output label
|
||||
conditions:
|
||||
@@ -349,18 +289,6 @@ pull_request_rules:
|
||||
add:
|
||||
- tool-calling
|
||||
|
||||
- name: auto-rebase if approved, ready, and 40 commits behind main
|
||||
conditions:
|
||||
- base = main
|
||||
- label=ready
|
||||
- "#approved-reviews-by >= 1"
|
||||
- "#commits-behind >= 40"
|
||||
- -closed
|
||||
- -draft
|
||||
- -conflict
|
||||
actions:
|
||||
rebase: {}
|
||||
|
||||
- name: ping author on conflicts and add 'needs-rebase' label
|
||||
conditions:
|
||||
- label != stale
|
||||
@@ -430,4 +358,4 @@ pull_request_rules:
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- kv-connector
|
||||
- kv-connector
|
||||
4
.github/workflows/cleanup_pr_body.yml
vendored
4
.github/workflows/cleanup_pr_body.yml
vendored
@@ -13,10 +13,10 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
|
||||
2
.github/workflows/macos-smoke-test.yml
vendored
2
.github/workflows/macos-smoke-test.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6.0.1
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
|
||||
4
.github/workflows/pre-commit.yml
vendored
4
.github/workflows/pre-commit.yml
vendored
@@ -16,8 +16,8 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||
|
||||
4
.github/workflows/stale.yml
vendored
4
.github/workflows/stale.yml
vendored
@@ -7,15 +7,13 @@ on:
|
||||
|
||||
jobs:
|
||||
close-issues-and-pull-requests:
|
||||
# Prevents triggering on forks or other repos
|
||||
if: github.repository == 'vllm-project/vllm'
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
actions: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
|
||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
||||
with:
|
||||
# Increasing this value ensures that changes to this workflow
|
||||
# propagate to all issues and PRs in days rather than months
|
||||
|
||||
123
CMakeLists.txt
123
CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@@ -357,8 +357,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# marlin arches for fp16 output
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
||||
# marlin has limited support for turing
|
||||
cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
|
||||
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
|
||||
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||
# marlin arches for fp8 input
|
||||
@@ -366,10 +364,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
||||
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
||||
# marlin arches for other files
|
||||
cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
|
||||
|
||||
if (MARLIN_OTHER_ARCHS)
|
||||
if (MARLIN_ARCHS)
|
||||
|
||||
#
|
||||
# For the Marlin kernels we automatically generate sources for various
|
||||
@@ -388,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$ENV{PYTHONPATH}
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
||||
RESULT_VARIABLE marlin_generation_result
|
||||
OUTPUT_VARIABLE marlin_generation_result
|
||||
@@ -410,39 +406,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
if (MARLIN_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
@@ -464,14 +446,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_SRCS}"
|
||||
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_SRCS}
|
||||
set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
|
||||
|
||||
message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
|
||||
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building Marlin kernels as no compatible archs found"
|
||||
" in CUDA target architectures")
|
||||
@@ -840,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
|
||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
|
||||
RESULT_VARIABLE machete_generation_result
|
||||
OUTPUT_VARIABLE machete_generation_output
|
||||
@@ -892,10 +874,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
|
||||
)
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
@@ -965,6 +944,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||
set(VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/torch_bindings.cpp"
|
||||
"csrc/moe/moe_align_sum_kernels.cu"
|
||||
"csrc/moe/moe_lora_align_sum_kernels.cu"
|
||||
"csrc/moe/topk_softmax_kernels.cu")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
@@ -998,16 +978,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# note that we always set `use_atomic_add=False` for moe marlin now,
|
||||
# so we don't need 9.0 for bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
||||
# moe marlin has limited support for turing
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
|
||||
# moe marlin arches for fp8 input
|
||||
# - sm80 doesn't support fp8 computation
|
||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
||||
# moe marlin arches for other files
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
|
||||
if (MARLIN_MOE_OTHER_ARCHS)
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
|
||||
#
|
||||
# For the Marlin MOE kernels we automatically generate sources for various
|
||||
@@ -1026,7 +1002,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$ENV{PYTHONPATH}
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
||||
RESULT_VARIABLE moe_marlin_generation_result
|
||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
||||
@@ -1048,29 +1024,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_MOE_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_MOE_SM75_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
|
||||
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
|
||||
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_MOE_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||
|
||||
if (MARLIN_MOE_FP8_ARCHS)
|
||||
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
|
||||
@@ -1084,17 +1047,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
|
||||
endif()
|
||||
|
||||
set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_OTHER_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
|
||||
|
||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
|
||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||
" in CUDA target architectures")
|
||||
|
||||
@@ -137,19 +137,16 @@ Compute Resources:
|
||||
- Alibaba Cloud
|
||||
- AMD
|
||||
- Anyscale
|
||||
- Arm
|
||||
- AWS
|
||||
- Crusoe Cloud
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- IBM
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
- NVIDIA
|
||||
- Red Hat
|
||||
- Replicate
|
||||
- Roblox
|
||||
- RunPod
|
||||
|
||||
@@ -18,11 +18,6 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
|
||||
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
|
||||
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
|
||||
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
|
||||
HOSTNAME=$(hostname)
|
||||
if [[ -z "$HOSTNAME" ]]; then
|
||||
echo "Error: Failed to determine hostname." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||
RESULT="$LOG_FOLDER/result.txt"
|
||||
@@ -87,7 +82,6 @@ start_server() {
|
||||
"$MODEL"
|
||||
"--disable-log-requests"
|
||||
"--port" "8004"
|
||||
"--host" "$HOSTNAME"
|
||||
"--gpu-memory-utilization" "$gpu_memory_utilization"
|
||||
"--max-num-seqs" "$max_num_seqs"
|
||||
"--max-num-batched-tokens" "$max_num_batched_tokens"
|
||||
@@ -102,9 +96,8 @@ start_server() {
|
||||
# This correctly passes each element as a separate argument.
|
||||
if [[ -n "$profile_dir" ]]; then
|
||||
# Start server with profiling enabled
|
||||
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
else
|
||||
# Start server without profiling
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
@@ -119,7 +112,7 @@ start_server() {
|
||||
# since that we should always have permission to send signal to the server process.
|
||||
kill -0 $server_pid 2> /dev/null || break
|
||||
|
||||
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||
server_started=1
|
||||
@@ -179,7 +172,6 @@ run_benchmark() {
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 1000 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
@@ -195,7 +187,7 @@ run_benchmark() {
|
||||
request_rate=$((${throughput%.*} + 1))
|
||||
while ((request_rate > 0)); do
|
||||
# clear prefix cache
|
||||
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
|
||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||
sleep 5
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||
vllm bench serve \
|
||||
@@ -211,7 +203,6 @@ run_benchmark() {
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
@@ -312,7 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 \
|
||||
--profile &> "$bm_log"
|
||||
else
|
||||
|
||||
@@ -620,7 +620,7 @@ def get_tokenizer(
|
||||
kwargs["use_fast"] = False
|
||||
if tokenizer_mode == "mistral":
|
||||
try:
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.tokenizers import MistralTokenizer
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"MistralTokenizer requires vllm package.\n"
|
||||
|
||||
@@ -1,120 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
|
||||
|
||||
This focuses on a single test payload shaped like the prefix-cache hash input:
|
||||
(32-byte bytes object, 32-int tuple)
|
||||
|
||||
Usage:
|
||||
python benchmarks/hash_micro_benchmark.py --iterations 20000
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import statistics
|
||||
import time
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
from vllm.utils.hashing import sha256, xxhash
|
||||
|
||||
|
||||
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
|
||||
"""Generate a deterministic test payload."""
|
||||
random.seed(seed)
|
||||
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
|
||||
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
|
||||
return (bytes_data, int_tuple)
|
||||
|
||||
|
||||
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
|
||||
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
|
||||
times: list[float] = []
|
||||
|
||||
# Warm-up to avoid first-run noise.
|
||||
for _ in range(200):
|
||||
func(data)
|
||||
|
||||
for _ in range(iterations):
|
||||
start = time.perf_counter()
|
||||
func(data)
|
||||
end = time.perf_counter()
|
||||
times.append(end - start)
|
||||
|
||||
avg = statistics.mean(times)
|
||||
std = statistics.stdev(times) if len(times) > 1 else 0.0
|
||||
return avg, std
|
||||
|
||||
|
||||
def _run_benchmarks(
|
||||
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
|
||||
data: tuple,
|
||||
iterations: int,
|
||||
):
|
||||
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
|
||||
for name, func in benchmarks:
|
||||
try:
|
||||
avg, std = _benchmark_func(func, data, iterations)
|
||||
except ModuleNotFoundError as exc:
|
||||
print(f"Skipping {name}: {exc}")
|
||||
continue
|
||||
yield name, avg, std
|
||||
|
||||
|
||||
def builtin_hash(data: tuple) -> int:
|
||||
"""Wrapper for Python's built-in hash()."""
|
||||
return hash(data)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--iterations",
|
||||
type=int,
|
||||
default=10_000,
|
||||
help="Number of measured iterations per hash function.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed", type=int, default=42, help="Random seed for test payload."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
data = _generate_test_data(args.seed)
|
||||
benchmarks = (
|
||||
("SHA256 (pickle)", sha256),
|
||||
("xxHash (pickle)", xxhash),
|
||||
("built-in hash()", builtin_hash),
|
||||
)
|
||||
|
||||
print("=" * 60)
|
||||
print("HASH FUNCTION MICRO BENCHMARK")
|
||||
print("=" * 60)
|
||||
print("Test data: (32-byte bytes object, 32-int tuple)")
|
||||
print(f"Iterations: {args.iterations:,}")
|
||||
print("=" * 60)
|
||||
|
||||
results = list(_run_benchmarks(benchmarks, data, args.iterations))
|
||||
builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
|
||||
|
||||
print("\nResults:")
|
||||
for name, avg, std in results:
|
||||
print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
|
||||
|
||||
if builtin_entry:
|
||||
_, builtin_avg, _ = builtin_entry
|
||||
print("\n" + "=" * 60)
|
||||
print("SUMMARY (relative to built-in hash())")
|
||||
print("=" * 60)
|
||||
for name, avg, _ in results:
|
||||
if name == "built-in hash()":
|
||||
continue
|
||||
speed_ratio = avg / builtin_avg
|
||||
print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
|
||||
else:
|
||||
print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -32,11 +32,12 @@ def benchmark_propose(args):
|
||||
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=args.num_token + args.num_spec_token,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
seed=None,
|
||||
trust_remote_code=False,
|
||||
)
|
||||
proposer = NgramProposer(
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Simple benchmark to compare prefix-cache block hashing algorithms.
|
||||
|
||||
Example:
|
||||
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from collections.abc import Callable, Iterable, Sequence
|
||||
|
||||
from vllm.utils.hashing import get_hash_fn_by_name
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
|
||||
|
||||
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
|
||||
|
||||
|
||||
def _generate_blocks(
|
||||
num_blocks: int, block_size: int, vocab_size: int, seed: int
|
||||
) -> list[list[int]]:
|
||||
rng = random.Random(seed)
|
||||
return [
|
||||
[rng.randrange(vocab_size) for _ in range(block_size)]
|
||||
for _ in range(num_blocks)
|
||||
]
|
||||
|
||||
|
||||
def _hash_all_blocks(
|
||||
hash_fn: Callable[[object], bytes],
|
||||
blocks: Iterable[Sequence[int]],
|
||||
) -> float:
|
||||
parent_hash: BlockHash | None = None
|
||||
start = time.perf_counter()
|
||||
for block in blocks:
|
||||
parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
|
||||
end = time.perf_counter()
|
||||
return end - start
|
||||
|
||||
|
||||
def _benchmark(
|
||||
hash_algo: str,
|
||||
blocks: list[list[int]],
|
||||
trials: int,
|
||||
) -> tuple[float, float, float] | None:
|
||||
try:
|
||||
hash_fn = get_hash_fn_by_name(hash_algo)
|
||||
init_none_hash(hash_fn)
|
||||
timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
|
||||
except ModuleNotFoundError as exc:
|
||||
print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
avg = statistics.mean(timings)
|
||||
best = min(timings)
|
||||
# throughput: tokens / second
|
||||
tokens_hashed = len(blocks) * len(blocks[0])
|
||||
throughput = tokens_hashed / best
|
||||
return avg, best, throughput
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
|
||||
parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
|
||||
parser.add_argument(
|
||||
"--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=0, help="Random seed.")
|
||||
parser.add_argument(
|
||||
"--trials", type=int, default=5, help="Number of timed trials per algorithm."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--algorithms",
|
||||
nargs="+",
|
||||
default=SUPPORTED_ALGOS,
|
||||
choices=SUPPORTED_ALGOS,
|
||||
help="Hash algorithms to benchmark.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
blocks = _generate_blocks(
|
||||
args.num_blocks, args.block_size, args.vocab_size, args.seed
|
||||
)
|
||||
print(
|
||||
f"Benchmarking {len(args.algorithms)} algorithms on "
|
||||
f"{args.num_blocks} blocks (block size={args.block_size})."
|
||||
)
|
||||
|
||||
for algo in args.algorithms:
|
||||
result = _benchmark(algo, blocks, args.trials)
|
||||
if result is None:
|
||||
continue
|
||||
|
||||
avg, best, throughput = result
|
||||
print(
|
||||
f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
|
||||
f"throughput: {throughput / 1e6:.2f}M tokens/s"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -574,7 +574,7 @@ async def benchmark(
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
|
||||
@@ -963,7 +963,8 @@ def create_argument_parser():
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result-dir",
|
||||
|
||||
@@ -14,9 +14,6 @@ from tqdm import tqdm
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -25,7 +22,6 @@ class bench_params_t:
|
||||
hidden_size: int
|
||||
add_residual: bool
|
||||
dtype: torch.dtype
|
||||
group_size: list[int]
|
||||
|
||||
def description(self):
|
||||
return (
|
||||
@@ -33,7 +29,6 @@ class bench_params_t:
|
||||
f"x D {self.hidden_size} "
|
||||
f"x R {self.add_residual} "
|
||||
f"x DT {self.dtype}"
|
||||
f"x GS {self.group_size}"
|
||||
)
|
||||
|
||||
|
||||
@@ -43,11 +38,10 @@ def get_bench_params() -> list[bench_params_t]:
|
||||
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
||||
ADD_RESIDUAL = [True, False]
|
||||
DTYPES = [torch.bfloat16, torch.float]
|
||||
GROUP_SIZES = [[1, 64], [1, 128]]
|
||||
|
||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
|
||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
||||
bench_params = list(
|
||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
|
||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
|
||||
)
|
||||
return bench_params
|
||||
|
||||
@@ -58,7 +52,6 @@ def unfused_int8_impl(
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
@@ -76,7 +69,6 @@ def unfused_fp8_impl(
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
@@ -89,63 +81,23 @@ def unfused_fp8_impl(
|
||||
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
||||
|
||||
|
||||
def unfused_groupwise_fp8_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
if residual is None:
|
||||
torch_out = rms_norm_layer.forward_cuda(x, residual)
|
||||
else:
|
||||
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
|
||||
|
||||
# Quant
|
||||
torch_out, _ = per_token_group_quant_fp8(
|
||||
torch_out, group_size=group_size[1], use_ue8m0=False
|
||||
)
|
||||
|
||||
|
||||
def fused_impl(
|
||||
rms_norm_layer: RMSNorm, # this stores the weights
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
||||
)
|
||||
|
||||
|
||||
def fused_groupwise_impl(
|
||||
rms_norm_layer: RMSNorm, # this stores the weights
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
out, _ = ops.rms_norm_per_block_quant(
|
||||
x,
|
||||
rms_norm_layer.weight,
|
||||
1e-6,
|
||||
quant_dtype,
|
||||
group_size,
|
||||
residual=residual,
|
||||
is_scale_transposed=True,
|
||||
)
|
||||
|
||||
|
||||
# Bench functions
|
||||
def bench_fn(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
label: str,
|
||||
sub_label: str,
|
||||
fn: Callable,
|
||||
@@ -158,11 +110,10 @@ def bench_fn(
|
||||
"x": x,
|
||||
"residual": residual,
|
||||
"quant_dtype": quant_dtype,
|
||||
"group_size": group_size,
|
||||
"fn": fn,
|
||||
}
|
||||
return TBenchmark.Timer(
|
||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
|
||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
@@ -196,7 +147,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.int8,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_int8_impl,
|
||||
@@ -211,7 +161,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_fp8_impl,
|
||||
@@ -226,7 +175,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.int8,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_impl,
|
||||
@@ -241,7 +189,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_impl,
|
||||
@@ -249,36 +196,6 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
)
|
||||
)
|
||||
|
||||
# unfused groupwise fp8 impl.
|
||||
timers.append(
|
||||
bench_fn(
|
||||
layer,
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_groupwise_fp8_impl,
|
||||
"unfused_groupwise_fp8_impl",
|
||||
)
|
||||
)
|
||||
|
||||
# fused groupwise fp8 impl.
|
||||
timers.append(
|
||||
bench_fn(
|
||||
layer,
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_groupwise_impl,
|
||||
"fused_groupwise_fp8_impl",
|
||||
)
|
||||
)
|
||||
|
||||
print_timers(timers)
|
||||
|
||||
return timers
|
||||
|
||||
@@ -1,244 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from itertools import product
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_per_token_group_quant_fp8_colmajor,
|
||||
silu_mul_per_token_group_quant_fp8_colmajor,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
||||
|
||||
from .utils import ArgPool, Bench, CudaGraphBenchParams
|
||||
|
||||
GROUP_SIZE = 128
|
||||
FLOAT8_T = torch.float8_e4m3fn
|
||||
|
||||
|
||||
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
|
||||
print(
|
||||
f"Note : The timings reported above is for {cuda_graph_nops} "
|
||||
"consecutive invocations of the benchmarking functions. "
|
||||
f"Please divide by {cuda_graph_nops} for single invocation "
|
||||
"timings."
|
||||
)
|
||||
compare = TBenchmark.Compare(timers)
|
||||
compare.print()
|
||||
|
||||
|
||||
class ImplType(Enum):
|
||||
SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
|
||||
REFERENCE = 2
|
||||
|
||||
def get_impl(self):
|
||||
if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
||||
return silu_mul_per_token_group_quant_fp8_colmajor
|
||||
elif self == ImplType.REFERENCE:
|
||||
return reference
|
||||
raise ValueError(f"Unrecognized ImplType {self}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkTensors:
|
||||
input: torch.Tensor
|
||||
output: torch.Tensor
|
||||
|
||||
# Reference act output tensor
|
||||
ref_act_out: torch.Tensor
|
||||
ref_quant_out: torch.Tensor
|
||||
|
||||
@staticmethod
|
||||
def make(T: int, N: int) -> "BenchmarkTensors":
|
||||
assert T % GROUP_SIZE == 0
|
||||
assert N % (GROUP_SIZE * 2) == 0
|
||||
|
||||
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
|
||||
|
||||
# silu_mul_per_token_group_quant_fp8_colmajor output.
|
||||
output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
|
||||
FLOAT8_T
|
||||
)
|
||||
|
||||
# reference output.
|
||||
ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
|
||||
ref_quant_out = torch.empty(
|
||||
(T, N // 2), dtype=torch.bfloat16, device="cuda"
|
||||
).to(FLOAT8_T)
|
||||
|
||||
return BenchmarkTensors(
|
||||
input=input,
|
||||
output=output,
|
||||
ref_act_out=ref_act_out,
|
||||
ref_quant_out=ref_quant_out,
|
||||
)
|
||||
|
||||
@property
|
||||
def T(self):
|
||||
return self.input.size(0)
|
||||
|
||||
@property
|
||||
def N(self):
|
||||
return self.input.size(1)
|
||||
|
||||
def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
|
||||
if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
||||
return {
|
||||
"input": self.input,
|
||||
"output": self.output,
|
||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
||||
}
|
||||
elif impl_type == ImplType.REFERENCE:
|
||||
return {
|
||||
"input": self.input,
|
||||
"act_out": self.ref_act_out,
|
||||
"quant_out": self.ref_quant_out,
|
||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
||||
}
|
||||
raise ValueError(f"Unrecognized impl_type {impl_type}")
|
||||
|
||||
|
||||
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
|
||||
"""
|
||||
Reference triton quant kernel from,
|
||||
vllm.model_executor.layers.quantization.utils.fp8_utils
|
||||
"""
|
||||
assert quant_out.size() == x.size()
|
||||
# Allocate the scale tensor column-major format.
|
||||
shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
|
||||
x_q = quant_out
|
||||
x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
|
||||
|
||||
M = x.numel() // GROUP_SIZE
|
||||
N = GROUP_SIZE
|
||||
BLOCK = triton.next_power_of_2(N)
|
||||
# heuristics for number of warps
|
||||
num_warps = min(max(BLOCK // 256, 1), 8)
|
||||
num_stages = 1
|
||||
|
||||
finfo = torch.finfo(FLOAT8_T)
|
||||
fp8_min = finfo.min
|
||||
fp8_max = finfo.max
|
||||
|
||||
_per_token_group_quant_fp8_colmajor[(M,)](
|
||||
x,
|
||||
x_q,
|
||||
x_s,
|
||||
GROUP_SIZE,
|
||||
x.shape[1],
|
||||
x.stride(0),
|
||||
x_s.stride(1),
|
||||
eps=1e-10,
|
||||
fp8_min=fp8_min,
|
||||
fp8_max=fp8_max,
|
||||
use_ue8m0=use_ue8m0,
|
||||
BLOCK=BLOCK,
|
||||
num_warps=num_warps,
|
||||
num_stages=num_stages,
|
||||
)
|
||||
return x_q, x_s
|
||||
|
||||
|
||||
def reference(
|
||||
input: torch.Tensor,
|
||||
act_out: torch.Tensor,
|
||||
quant_out: torch.Tensor,
|
||||
use_ue8m0: bool,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
torch.ops._C.silu_and_mul(act_out, input)
|
||||
return reference_quant(act_out, quant_out, use_ue8m0)
|
||||
|
||||
|
||||
def bench_impl(
|
||||
bench_tensors: list[BenchmarkTensors], impl_type: ImplType
|
||||
) -> TMeasurement:
|
||||
T = bench_tensors[0].T
|
||||
N = bench_tensors[0].N
|
||||
|
||||
arg_pool_size = len(bench_tensors)
|
||||
kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
|
||||
|
||||
# warmup
|
||||
for kwargs in kwargs_list:
|
||||
impl_type.get_impl()(**kwargs)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||
for _kwargs in kwargs_list:
|
||||
for k, v in _kwargs.items():
|
||||
kwargs[k].values.append(v)
|
||||
|
||||
cuda_graph_params = None
|
||||
cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
|
||||
timer = None
|
||||
with Bench(
|
||||
cuda_graph_params,
|
||||
"silu-mul-quant",
|
||||
f"num_tokens={T}, N={N}",
|
||||
impl_type.name,
|
||||
impl_type.get_impl(),
|
||||
**kwargs,
|
||||
) as bench:
|
||||
timer = bench.run()
|
||||
return timer
|
||||
|
||||
|
||||
def test_correctness(T: int, N: int):
|
||||
print(f"Testing num_tokens={T}, N={N} ...")
|
||||
|
||||
bench_tensor = BenchmarkTensors.make(T, N)
|
||||
|
||||
def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
|
||||
|
||||
# reference output
|
||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
||||
|
||||
# test ouptut
|
||||
out_q, out_s = output_from_impl(
|
||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||
)
|
||||
|
||||
torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
|
||||
torch.testing.assert_close(ref_out_s, out_s)
|
||||
|
||||
|
||||
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
|
||||
timers = []
|
||||
for N, T in product(Ns, Ts):
|
||||
test_correctness(T, N)
|
||||
|
||||
bench_tensors: list[BenchmarkTensors] = [
|
||||
BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
|
||||
]
|
||||
|
||||
silu_mul_quant_timer = bench_impl(
|
||||
bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||
)
|
||||
timers.append(silu_mul_quant_timer)
|
||||
reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
|
||||
timers.append(reference_timer)
|
||||
|
||||
print_timers(
|
||||
[silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
|
||||
)
|
||||
|
||||
print_timers(timers, cuda_graph_nops=arg_pool_size)
|
||||
|
||||
return timers
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
|
||||
N = [2048, 4096, 8192]
|
||||
|
||||
print(f"T = {T}, N = {N}")
|
||||
run(T, N, arg_pool_size=8)
|
||||
@@ -13,8 +13,8 @@ from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
batch_size_range = [1, 16, 128]
|
||||
seq_len_range = [1, 16, 64, 1024, 4096]
|
||||
batch_size_range = [1, 16, 32, 64, 128]
|
||||
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
intermediate_size = [3072, 9728, 12288]
|
||||
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
|
||||
|
||||
|
||||
@@ -1,150 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
|
||||
in MLA (Multi-head Latent Attention) prefill.
|
||||
|
||||
This validates that the optimization from commit 8d4142bd is beneficial across
|
||||
various batch sizes, not just the originally tested batch size of 32768.
|
||||
"""
|
||||
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
|
||||
import torch
|
||||
|
||||
# DeepSeek-V3 MLA dimensions
|
||||
NUM_HEADS = 128
|
||||
QK_NOPE_HEAD_DIM = 128
|
||||
PE_DIM = 64
|
||||
|
||||
|
||||
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
|
||||
"""Original torch.cat approach with expand."""
|
||||
return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
|
||||
|
||||
|
||||
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
|
||||
"""Optimized direct copy approach (avoids expand + cat overhead)."""
|
||||
k = torch.empty(
|
||||
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
|
||||
dtype=k_nope.dtype,
|
||||
device=k_nope.device,
|
||||
)
|
||||
k[..., : k_nope.shape[-1]] = k_nope
|
||||
k[..., k_nope.shape[-1] :] = k_pe
|
||||
return k
|
||||
|
||||
|
||||
def benchmark_method(
|
||||
method: Callable,
|
||||
k_nope: torch.Tensor,
|
||||
k_pe: torch.Tensor,
|
||||
num_warmup: int = 10,
|
||||
num_iters: int = 100,
|
||||
) -> float:
|
||||
"""Benchmark a concatenation method and return mean latency in ms."""
|
||||
# Warmup
|
||||
for _ in range(num_warmup):
|
||||
_ = method(k_nope, k_pe)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
start = time.perf_counter()
|
||||
for _ in range(num_iters):
|
||||
_ = method(k_nope, k_pe)
|
||||
torch.cuda.synchronize()
|
||||
end = time.perf_counter()
|
||||
|
||||
return (end - start) / num_iters * 1000 # Convert to ms
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def run_benchmark(dtype: torch.dtype, dtype_name: str):
|
||||
"""Run benchmark for a specific dtype."""
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
# Batch sizes to test (powers of 2 from 32 to 65536)
|
||||
batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
|
||||
|
||||
print("=" * 80)
|
||||
print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
|
||||
print("=" * 80)
|
||||
print(
|
||||
f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
|
||||
f"k_pe=[B, 1, {PE_DIM}]"
|
||||
)
|
||||
print(f"dtype: {dtype_name}")
|
||||
print()
|
||||
print(
|
||||
f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
|
||||
f"{'Speedup':>8} | {'Reduction':>10}"
|
||||
)
|
||||
print("-" * 70)
|
||||
|
||||
results = []
|
||||
for batch_size in batch_sizes:
|
||||
# Create input tensors (generate in float32 then convert for FP8 compatibility)
|
||||
k_nope = torch.randn(
|
||||
batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
|
||||
).to(dtype)
|
||||
k_pe = torch.randn(
|
||||
batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
|
||||
).to(dtype)
|
||||
|
||||
# Benchmark both methods
|
||||
cat_time = benchmark_method(cat_method, k_nope, k_pe)
|
||||
direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
|
||||
|
||||
speedup = cat_time / direct_time
|
||||
reduction = (1 - direct_time / cat_time) * 100
|
||||
|
||||
results.append((batch_size, cat_time, direct_time, speedup, reduction))
|
||||
|
||||
print(
|
||||
f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
|
||||
f"{speedup:>7.2f}x | {reduction:>9.1f}%"
|
||||
)
|
||||
|
||||
print("=" * 80)
|
||||
|
||||
# Summary statistics
|
||||
speedups = [r[3] for r in results]
|
||||
print("\nSpeedup summary:")
|
||||
print(f" Min: {min(speedups):.2f}x")
|
||||
print(f" Max: {max(speedups):.2f}x")
|
||||
print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
|
||||
|
||||
# Find crossover point
|
||||
crossover_batch = None
|
||||
for batch_size, _, _, speedup, _ in results:
|
||||
if speedup >= 1.0:
|
||||
crossover_batch = batch_size
|
||||
break
|
||||
|
||||
print("\nConclusion:")
|
||||
if crossover_batch:
|
||||
print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
|
||||
# Filter for large batches (>= 512 which is typical for prefill)
|
||||
large_batch_speedups = [r[3] for r in results if r[0] >= 512]
|
||||
if large_batch_speedups:
|
||||
avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
|
||||
print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
|
||||
print(" - MLA prefill typically uses large batches, so optimization is effective")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main():
|
||||
# Test bfloat16
|
||||
print("\n")
|
||||
run_benchmark(torch.bfloat16, "bfloat16")
|
||||
|
||||
# Test float8_e4m3fn
|
||||
print("\n")
|
||||
run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -24,15 +24,12 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
||||
num_tokens_range = [1, 16, 256, 4096]
|
||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||
topk_range = [1, 2, 8]
|
||||
ep_size_range = [1, 8]
|
||||
configs = list(
|
||||
itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
|
||||
)
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk", "ep_size"],
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["vllm"],
|
||||
@@ -41,26 +38,16 @@ configs = list(
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, ep_size, provider):
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
"""Benchmark function for Triton."""
|
||||
block_size = 256
|
||||
torch.cuda.manual_seed_all(0)
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
e_map = None
|
||||
if ep_size != 1:
|
||||
local_e = num_experts // ep_size
|
||||
e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
|
||||
e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
|
||||
e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "vllm":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: moe_align_block_size(
|
||||
topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
|
||||
),
|
||||
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
|
||||
@@ -99,6 +99,7 @@ def benchmark_mrope(
|
||||
# the parameters to compute the q k v size based on tp_size
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim,
|
||||
max_position=max_position,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_parameters=rope_parameters,
|
||||
|
||||
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
|
||||
def benchmark(batch_size, seq_len, num_heads, provider):
|
||||
dtype = torch.bfloat16
|
||||
max_position = 8192
|
||||
rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
|
||||
rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
|
||||
base = 10000
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
|
||||
rope = rope.to(dtype=dtype, device=device)
|
||||
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
|
||||
|
||||
|
||||
@@ -251,6 +251,17 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
endif()
|
||||
|
||||
# Build ACL with CMake
|
||||
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
set(ARM_COMPUTE_ARCH "armv8.2-a")
|
||||
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
|
||||
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
|
||||
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
|
||||
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
|
||||
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
|
||||
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
|
||||
set(ARM_COMPUTE_BUILD_TESTING "OFF")
|
||||
|
||||
set(_cmake_config_cmd
|
||||
${CMAKE_COMMAND} -G Ninja -B build
|
||||
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
|
||||
@@ -330,7 +341,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
PUBLIC ${oneDNN_BINARY_DIR}/include
|
||||
PRIVATE ${oneDNN_SOURCE_DIR}/src
|
||||
)
|
||||
target_link_libraries(dnnl_ext dnnl torch)
|
||||
target_link_libraries(dnnl_ext dnnl)
|
||||
target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
|
||||
list(APPEND LIBS dnnl_ext)
|
||||
set(USE_ONEDNN ON)
|
||||
@@ -358,13 +369,13 @@ set(VLLM_EXT_SRC
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
|
||||
"csrc/cpu/cpu_attn.cpp"
|
||||
"csrc/cpu/scratchpad_manager.cpp"
|
||||
"csrc/cpu/torch_bindings.cpp")
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
"csrc/cpu/cpu_wna16.cpp"
|
||||
"csrc/cpu/cpu_fused_moe.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
|
||||
set(VLLM_EXT_SRC
|
||||
|
||||
@@ -35,21 +35,16 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
# sm90a
|
||||
|
||||
set(SUPPORT_ARCHS)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
|
||||
list(APPEND SUPPORT_ARCHS "9.0a")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
|
||||
list(APPEND SUPPORT_ARCHS 9.0a)
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
|
||||
# CUDA 12.9 has introduced "Family-Specific Architecture Features"
|
||||
# this supports all compute_10x family
|
||||
list(APPEND SUPPORT_ARCHS "10.0f")
|
||||
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
list(APPEND SUPPORT_ARCHS "10.0a")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
|
||||
list(APPEND SUPPORT_ARCHS 10.0a)
|
||||
endif()
|
||||
|
||||
|
||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
||||
if(FLASH_MLA_ARCHS)
|
||||
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
|
||||
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
||||
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
||||
|
||||
@@ -131,8 +126,7 @@ if(FLASH_MLA_ARCHS)
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
else()
|
||||
message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
|
||||
# Create empty targets for setup.py on unsupported systems
|
||||
# Create empty targets for setup.py when not targeting sm90a systems
|
||||
add_custom_target(_flashmla_C)
|
||||
add_custom_target(_flashmla_extension_C)
|
||||
endif()
|
||||
|
||||
@@ -140,21 +140,16 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
|
||||
run_python(_VLLM_TORCH_GOMP_PATH
|
||||
"
|
||||
import os, glob
|
||||
import torch
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
|
||||
# Search both torch.libs and torch/lib
|
||||
roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
|
||||
candidates = []
|
||||
for root in roots:
|
||||
if not os.path.isdir(root):
|
||||
continue
|
||||
candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
|
||||
|
||||
print(candidates[0] if candidates else '')
|
||||
try:
|
||||
import torch
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
torch_libs = os.path.join(site_root, 'torch.libs')
|
||||
print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
|
||||
except:
|
||||
print('')
|
||||
"
|
||||
"failed to probe for libgomp")
|
||||
"failed to probe torch.libs for libgomp")
|
||||
|
||||
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
|
||||
return()
|
||||
|
||||
@@ -15,61 +15,19 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
|
||||
const scalar_t& y) {
|
||||
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
|
||||
}
|
||||
|
||||
// Check if all pointers are 16-byte aligned for int4 vectorized access
|
||||
__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
|
||||
return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
|
||||
}
|
||||
|
||||
// Activation and gating kernel template.
|
||||
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
|
||||
bool act_first>
|
||||
__global__ void act_and_mul_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||
const int d) {
|
||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const scalar_t* x_ptr = input + token_idx * 2 * d;
|
||||
const scalar_t* y_ptr = x_ptr + d;
|
||||
scalar_t* out_ptr = out + token_idx * d;
|
||||
|
||||
// Check alignment for 128-bit vectorized access.
|
||||
// All three pointers must be 16-byte aligned for safe int4 operations.
|
||||
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
|
||||
is_16byte_aligned(out_ptr);
|
||||
|
||||
if (aligned && d >= VEC_SIZE) {
|
||||
// Fast path: 128-bit vectorized loop
|
||||
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
|
||||
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
|
||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
||||
const int num_vecs = d / VEC_SIZE;
|
||||
const int vec_end = num_vecs * VEC_SIZE;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
|
||||
auto* xp = reinterpret_cast<scalar_t*>(&x);
|
||||
auto* yp = reinterpret_cast<scalar_t*>(&y);
|
||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; j++) {
|
||||
rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
|
||||
}
|
||||
out_vec[i] = r;
|
||||
}
|
||||
// Scalar cleanup for remaining elements
|
||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
||||
out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
|
||||
VLLM_LDG(&y_ptr[i]));
|
||||
}
|
||||
} else {
|
||||
// Scalar fallback for unaligned data or small d
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
|
||||
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
|
||||
out_ptr[idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
|
||||
}
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
|
||||
out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,115 +120,50 @@ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
|
||||
__global__ void act_and_mul_kernel_with_param(
|
||||
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
|
||||
const float param) {
|
||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const scalar_t* x_ptr = input + token_idx * 2 * d;
|
||||
const scalar_t* y_ptr = x_ptr + d;
|
||||
scalar_t* out_ptr = out + token_idx * d;
|
||||
|
||||
// Check alignment for 128-bit vectorized access
|
||||
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
|
||||
is_16byte_aligned(out_ptr);
|
||||
|
||||
if (aligned && d >= VEC_SIZE) {
|
||||
// Fast path: 128-bit vectorized loop
|
||||
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
|
||||
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
|
||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
||||
const int num_vecs = d / VEC_SIZE;
|
||||
const int vec_end = num_vecs * VEC_SIZE;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
|
||||
auto* xp = reinterpret_cast<scalar_t*>(&x);
|
||||
auto* yp = reinterpret_cast<scalar_t*>(&y);
|
||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; j++) {
|
||||
rp[j] = ACT_FN(xp[j], param) * yp[j];
|
||||
}
|
||||
out_vec[i] = r;
|
||||
}
|
||||
// Scalar cleanup for remaining elements
|
||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
||||
out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
|
||||
}
|
||||
} else {
|
||||
// Scalar fallback for unaligned data or small d
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
|
||||
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
|
||||
out_ptr[idx] = ACT_FN(x, param) * y;
|
||||
}
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
|
||||
out[token_idx * d + idx] = ACT_FN(x, param) * y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
|
||||
float alpha, float limit) {
|
||||
// Clamp gate to (-inf, limit] and up to [-limit, limit]
|
||||
const float g = fminf((float)gate, limit);
|
||||
const float u = fmaxf(fminf((float)up, limit), -limit);
|
||||
// glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
|
||||
return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
|
||||
// clamp gate: min=None, max=limit
|
||||
const float gate_f = (float)gate;
|
||||
const float clamped_gate = gate_f > limit ? limit : gate_f;
|
||||
|
||||
// clamp up: min=-limit, max=limit
|
||||
const float up_f = (float)up;
|
||||
const float clamped_up =
|
||||
up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
|
||||
|
||||
// glu = gate * sigmoid(gate * alpha)
|
||||
const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
|
||||
const float glu = clamped_gate * sigmoid_val;
|
||||
|
||||
// (up + 1) * glu
|
||||
return (T)((clamped_up + 1.0f) * glu);
|
||||
}
|
||||
|
||||
// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
|
||||
template <typename scalar_t,
|
||||
scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
|
||||
const float)>
|
||||
__global__ void swigluoai_and_mul_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., 2 * d] (interleaved)
|
||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||
const int d, const float alpha, const float limit) {
|
||||
// For interleaved data: input has 2*d elements per token (gate/up pairs)
|
||||
// output has d elements per token
|
||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
||||
constexpr int PAIRS = VEC_SIZE / 2; // Number of gate/up pairs per int4 load
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const scalar_t* in_ptr = input + token_idx * 2 * d;
|
||||
scalar_t* out_ptr = out + token_idx * d;
|
||||
// TODO: Vectorize loads and stores.
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
// gate = x[..., ::2] (even indices)
|
||||
const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
|
||||
// up = x[..., 1::2] (odd indices)
|
||||
const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
|
||||
|
||||
// Check alignment for 128-bit vectorized access on input.
|
||||
// For output we use int2 (64-bit) which has 8-byte alignment requirement.
|
||||
const bool in_aligned = is_16byte_aligned(in_ptr);
|
||||
const bool out_aligned =
|
||||
(reinterpret_cast<uintptr_t>(out_ptr) & 7) == 0; // 8-byte for int2
|
||||
|
||||
if (in_aligned && out_aligned && d >= PAIRS) {
|
||||
// Fast path: vectorized loop
|
||||
// Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
|
||||
// Each int2 store writes PAIRS output elements
|
||||
const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
|
||||
int2* out_vec = reinterpret_cast<int2*>(out_ptr);
|
||||
const int num_vecs = d / PAIRS;
|
||||
const int vec_end = num_vecs * PAIRS;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
int4 v = VLLM_LDG(&in_vec[i]);
|
||||
int2 r;
|
||||
auto* vp = reinterpret_cast<scalar_t*>(&v);
|
||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < PAIRS; j++) {
|
||||
rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
|
||||
}
|
||||
out_vec[i] = r;
|
||||
}
|
||||
// Scalar cleanup for remaining elements
|
||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
||||
out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
|
||||
VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
|
||||
}
|
||||
} else {
|
||||
// Scalar fallback for unaligned data or small d
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
// gate = x[..., ::2] (even indices)
|
||||
const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
|
||||
// up = x[..., 1::2] (odd indices)
|
||||
const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
|
||||
out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
|
||||
}
|
||||
out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -324,41 +217,10 @@ __global__ void activation_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., d]
|
||||
const int d) {
|
||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const scalar_t* in_ptr = input + token_idx * d;
|
||||
scalar_t* out_ptr = out + token_idx * d;
|
||||
|
||||
// Check alignment for 128-bit vectorized access
|
||||
const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
|
||||
|
||||
if (aligned && d >= VEC_SIZE) {
|
||||
// Fast path: 128-bit vectorized loop
|
||||
const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
|
||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
||||
const int num_vecs = d / VEC_SIZE;
|
||||
const int vec_end = num_vecs * VEC_SIZE;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
int4 v = VLLM_LDG(&in_vec[i]), r;
|
||||
auto* vp = reinterpret_cast<scalar_t*>(&v);
|
||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; j++) {
|
||||
rp[j] = ACT_FN(vp[j]);
|
||||
}
|
||||
out_vec[i] = r;
|
||||
}
|
||||
// Scalar cleanup for remaining elements
|
||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
||||
out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
|
||||
}
|
||||
} else {
|
||||
// Scalar fallback for unaligned data or small d
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&in_ptr[idx]);
|
||||
out_ptr[idx] = ACT_FN(x);
|
||||
}
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
|
||||
out[token_idx * d + idx] = ACT_FN(x);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
12
csrc/cache.h
12
csrc/cache.h
@@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
@@ -59,15 +58,6 @@ void cp_gather_cache(
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
// Gather and upconvert FP8 KV cache to BF16 workspace
|
||||
void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, 576]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& seq_lens, // [BATCH]
|
||||
torch::Tensor const& workspace_starts, // [BATCH]
|
||||
int64_t batch_size);
|
||||
|
||||
// Indexer K quantization and cache function
|
||||
void indexer_k_quant_and_cache(
|
||||
torch::Tensor& k, // [num_tokens, head_dim]
|
||||
@@ -82,4 +72,4 @@ void cp_gather_indexer_k_quant_cache(
|
||||
torch::Tensor& dst_k, // [num_tokens, head_dim]
|
||||
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
|
||||
const torch::Tensor& block_table, // [batch_size, num_blocks]
|
||||
const torch::Tensor& cu_seq_lens); // [batch_size + 1]
|
||||
const torch::Tensor& cu_seq_lens); // [batch_size + 1]
|
||||
@@ -2,7 +2,6 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "cuda_compat.h"
|
||||
@@ -515,8 +514,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
const int quant_block_size, // quantization block size
|
||||
const int cache_block_size, // cache block size
|
||||
const int cache_stride, // stride for each token in kv_cache
|
||||
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
) {
|
||||
constexpr int VEC_SIZE = 4;
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
@@ -1063,82 +1061,6 @@ void gather_and_maybe_dequant_cache(
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Gather and upconvert FP8 KV cache tokens to BF16 workspace
|
||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
||||
const int32_t block_size, const int32_t head_dim,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = workspace_starts[bid];
|
||||
const int32_t seq_len = seq_lens[bid];
|
||||
const int32_t tot_slots = seq_len;
|
||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
||||
|
||||
const int32_t split_start = split * split_slots;
|
||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
||||
|
||||
const bool is_active_split = (split_start < tot_slots);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = split_start;
|
||||
int32_t offset_div = offset / block_size;
|
||||
offset = offset % block_size;
|
||||
const int32_t* batch_block_table = block_table + batch_offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
// Process each token in this split
|
||||
for (int pid = split_start; pid < split_end; ++pid) {
|
||||
auto block_id = batch_block_table[offset_div];
|
||||
const uint8_t* token_ptr =
|
||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
||||
|
||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
||||
const uint8_t* no_pe_ptr = token_ptr;
|
||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||
const __nv_bfloat16* rope_ptr =
|
||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
||||
|
||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
||||
if (tid < 512) {
|
||||
// FP8 dequantization
|
||||
const int tile = tid >> 7; // each tile is 128 elements
|
||||
const float scale = scales_ptr[tile];
|
||||
const uint8_t val = no_pe_ptr[tid];
|
||||
dst_ptr[tid] =
|
||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
||||
} else if (tid < 576) {
|
||||
// Rope copy (64 bf16 elements)
|
||||
const int rope_idx = tid - 512;
|
||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
||||
}
|
||||
|
||||
// Move to next token
|
||||
offset += 1;
|
||||
if (offset == block_size) {
|
||||
offset_div += 1;
|
||||
offset = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
|
||||
// block_size.
|
||||
@@ -1280,57 +1202,6 @@ void cp_gather_cache(
|
||||
}
|
||||
}
|
||||
|
||||
void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, 576]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& seq_lens, // [BATCH]
|
||||
torch::Tensor const& workspace_starts, // [BATCH]
|
||||
int64_t batch_size) {
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int32_t block_size = src_cache.size(1);
|
||||
int32_t head_dim = dst.size(1);
|
||||
|
||||
TORCH_CHECK(block_table.dtype() == torch::kInt32,
|
||||
"block_table must be int32");
|
||||
TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
|
||||
TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
|
||||
"workspace_starts must be int32");
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == block_table.device(),
|
||||
"src_cache and block_table must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == seq_lens.device(),
|
||||
"src_cache and seq_lens must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == workspace_starts.device(),
|
||||
"src_cache and workspace_starts must be on the same device");
|
||||
|
||||
TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
|
||||
TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
|
||||
TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
|
||||
|
||||
int64_t block_table_stride = block_table.stride(0);
|
||||
int64_t cache_block_stride = src_cache.stride(0);
|
||||
int64_t cache_entry_stride = src_cache.stride(1);
|
||||
int64_t dst_entry_stride = dst.stride(0);
|
||||
|
||||
// Decide on the number of splits based on the batch size
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(576);
|
||||
|
||||
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
|
||||
src_cache.data_ptr<uint8_t>(),
|
||||
reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
||||
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
|
||||
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
|
||||
block_table_stride, cache_block_stride, cache_entry_stride,
|
||||
dst_entry_stride);
|
||||
}
|
||||
|
||||
// Macro to dispatch the kernel based on the data type.
|
||||
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
|
||||
@@ -117,6 +117,7 @@ torch::Tensor get_scheduler_metadata(
|
||||
input.casual = casual;
|
||||
input.isa = isa;
|
||||
input.enable_kv_split = enable_kv_split;
|
||||
TORCH_CHECK(casual, "Only supports casual mask for now.");
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
|
||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
||||
|
||||
@@ -8,8 +8,10 @@
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#include "cpu/cpu_arch_macros.h"
|
||||
#include "cpu/utils.hpp"
|
||||
#include "cpu_types.hpp"
|
||||
#include "scratchpad_manager.h"
|
||||
#include "cpu_attn_macros.h"
|
||||
#include "utils.hpp"
|
||||
|
||||
namespace cpu_attention {
|
||||
enum class ISA { AMX, VEC, VEC16, NEON };
|
||||
@@ -184,7 +186,7 @@ struct AttentionMetadata {
|
||||
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
|
||||
// * q_tile_size * 4, partial output, max + sum (float)
|
||||
// Reduction scratchpad contains:
|
||||
// - flags: bool array to indicate whether the split is finished
|
||||
// - flags: bool array to indicate wether the split is finished
|
||||
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
|
||||
// - max, sum: 2 * split_num * q_tile_size * 4
|
||||
class AttentionScratchPad {
|
||||
@@ -376,13 +378,12 @@ class AttentionScheduler {
|
||||
|
||||
static constexpr int32_t MaxQTileIterNum = 128;
|
||||
|
||||
AttentionScheduler()
|
||||
: available_cache_size_(cpu_utils::get_available_l2_size()) {}
|
||||
AttentionScheduler() : available_cache_size_(get_available_l2_size()) {}
|
||||
|
||||
torch::Tensor schedule(const ScheduleInput& input) const {
|
||||
const bool casual = input.casual;
|
||||
const int32_t thread_num = omp_get_max_threads();
|
||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
||||
const int64_t cache_size = get_available_l2_size();
|
||||
const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
|
||||
const int32_t kv_len_alignment = input.kv_block_alignment;
|
||||
int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
|
||||
@@ -658,7 +659,7 @@ class AttentionScheduler {
|
||||
metadata_ptr->thread_num +
|
||||
metadata_ptr->reduction_scratchpad_size_per_kv_head *
|
||||
(use_gqa ? input.num_heads_kv : input.num_heads_q);
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(
|
||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
|
||||
scratchpad_size);
|
||||
|
||||
// metadata_ptr->print();
|
||||
@@ -666,7 +667,7 @@ class AttentionScheduler {
|
||||
// test out of boundary access
|
||||
// {
|
||||
// float* cache_ptr =
|
||||
// cpu_utils::ScratchPadManager::getl_scratchpad_manager()->get_data<float>();
|
||||
// DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<float>();
|
||||
// for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
|
||||
// cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
|
||||
// }
|
||||
@@ -748,6 +749,27 @@ class AttentionScheduler {
|
||||
return std::max(rounded_tile_size, round_size);
|
||||
}
|
||||
|
||||
static int64_t get_available_l2_size() {
|
||||
static int64_t size = []() {
|
||||
#if defined(__APPLE__)
|
||||
// macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
|
||||
int64_t l2_cache_size = 0;
|
||||
size_t len = sizeof(l2_cache_size);
|
||||
if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
|
||||
l2_cache_size > 0) {
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
}
|
||||
// Fallback if sysctlbyname fails
|
||||
return 128LL * 1024 >> 1; // use 50% of 128KB
|
||||
#else
|
||||
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||
TORCH_CHECK_NE(l2_cache_size, -1);
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
#endif
|
||||
}();
|
||||
return size;
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t available_cache_size_;
|
||||
};
|
||||
@@ -1224,8 +1246,14 @@ class AttentionMainLoop {
|
||||
// rescale sum and partial outputs
|
||||
if (need_rescale) {
|
||||
// compute rescale factor
|
||||
#ifdef DEFINE_FAST_EXP
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
rescale_factor_vec = fast_exp(rescale_factor_vec);
|
||||
rescale_factor = rescale_factor_vec.get_last_elem();
|
||||
#else
|
||||
rescale_factor = std::exp(rescale_factor);
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
#endif
|
||||
|
||||
// rescale sum
|
||||
new_sum_val += rescale_factor * init_sum_val;
|
||||
@@ -1380,7 +1408,7 @@ class AttentionMainLoop {
|
||||
|
||||
// init buffers
|
||||
void* scratchpad_ptr =
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()
|
||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()
|
||||
->get_data<void>();
|
||||
AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);
|
||||
|
||||
@@ -1400,7 +1428,8 @@ class AttentionMainLoop {
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t available_cache_size = cpu_utils::get_available_l2_size();
|
||||
const int64_t available_cache_size =
|
||||
AttentionScheduler::get_available_l2_size();
|
||||
const int32_t default_tile_size =
|
||||
AttentionScheduler::calcu_default_tile_size(
|
||||
available_cache_size, head_dim, sizeof(kv_cache_t),
|
||||
@@ -1860,8 +1889,15 @@ class AttentionMainLoop {
|
||||
: curr_output_buffer;
|
||||
float rescale_factor = final_max > curr_max ? curr_max - final_max
|
||||
: final_max - curr_max;
|
||||
|
||||
#ifdef DEFINE_FAST_EXP
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
rescale_factor_vec = fast_exp(rescale_factor_vec);
|
||||
rescale_factor = rescale_factor_vec.get_last_elem();
|
||||
#else
|
||||
rescale_factor = std::exp(rescale_factor);
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
#endif
|
||||
|
||||
local_sum[head_idx] = final_max > curr_max
|
||||
? final_sum + rescale_factor * curr_sum
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef CPU_ARCH_MACROS_H
|
||||
#define CPU_ARCH_MACROS_H
|
||||
#ifndef CPU_ATTN_MACROS_H
|
||||
#define CPU_ATTN_MACROS_H
|
||||
|
||||
// x86_64
|
||||
#ifdef __x86_64__
|
||||
@@ -26,7 +26,7 @@
|
||||
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \
|
||||
const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \
|
||||
const int n_mantissa_bits = 23; \
|
||||
auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__(( \
|
||||
auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__(( \
|
||||
always_inline)) { \
|
||||
__m512 values = vec.reg; \
|
||||
auto less_ln_flt_min_mask = \
|
||||
@@ -60,54 +60,4 @@
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
// Implementation copied from Arm Optimized Routines (expf AdvSIMD)
|
||||
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
|
||||
#include <limits>
|
||||
#define DEFINE_FAST_EXP \
|
||||
const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \
|
||||
const float ln2_hi = 0x1.62e4p-1f; \
|
||||
const float ln2_lo = 0x1.7f7d1cp-20f; \
|
||||
const float c0 = 0x1.0e4020p-7f; \
|
||||
const float c2 = 0x1.555e66p-3f; \
|
||||
const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; \
|
||||
const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); \
|
||||
const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); \
|
||||
const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); \
|
||||
const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); \
|
||||
const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); \
|
||||
const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound); \
|
||||
const float32x4_t inf = \
|
||||
vdupq_n_f32(std::numeric_limits<float>::infinity()); \
|
||||
const float32x4_t zero = vdupq_n_f32(0.0f); \
|
||||
auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) { \
|
||||
float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \
|
||||
float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); \
|
||||
r = vfmsq_laneq_f32(r, n, ln2_c02, 1); \
|
||||
uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
|
||||
float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); \
|
||||
float32x4_t r2 = vmulq_f32(r, r); \
|
||||
float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); \
|
||||
float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); \
|
||||
q = vfmaq_f32(q, p, r2); \
|
||||
p = vmulq_f32(c4, r); \
|
||||
float32x4_t poly = vfmaq_f32(p, q, r2); \
|
||||
poly = vfmaq_f32(scale, poly, scale); \
|
||||
const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound); \
|
||||
const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound); \
|
||||
poly = vbslq_f32(hi_mask, inf, poly); \
|
||||
return vbslq_f32(lo_mask, zero, poly); \
|
||||
}; \
|
||||
auto fast_exp = [&](const vec_op::FP32Vec16& vec) \
|
||||
__attribute__((always_inline)) { \
|
||||
float32x4x4_t result; \
|
||||
result.val[0] = neon_expf(vec.reg.val[0]); \
|
||||
result.val[1] = neon_expf(vec.reg.val[1]); \
|
||||
result.val[2] = neon_expf(vec.reg.val[2]); \
|
||||
result.val[3] = neon_expf(vec.reg.val[3]); \
|
||||
return vec_op::FP32Vec16(result); \
|
||||
};
|
||||
|
||||
#endif // __aarch64__
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -1,727 +0,0 @@
|
||||
#include "cpu/cpu_types.hpp"
|
||||
#include "cpu/utils.hpp"
|
||||
#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
|
||||
#include "cpu/cpu_arch_macros.h"
|
||||
|
||||
#ifdef CPU_CAPABILITY_AMXBF16
|
||||
#include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
|
||||
#define AMX_DISPATCH(...) \
|
||||
case cpu_utils::ISA::AMX: { \
|
||||
using gemm_t = cpu_micro_gemm::MicroGemm<cpu_utils::ISA::AMX, scalar_t>; \
|
||||
return __VA_ARGS__(); \
|
||||
}
|
||||
#else
|
||||
#define AMX_DISPATCH(...) case cpu_utils::ISA::AMX:
|
||||
#endif
|
||||
|
||||
#define CPU_ISA_DISPATCH_IMPL(ISA_TYPE, ...) \
|
||||
[&] { \
|
||||
switch (ISA_TYPE) { \
|
||||
AMX_DISPATCH(__VA_ARGS__) \
|
||||
case cpu_utils::ISA::VEC: { \
|
||||
using gemm_t = \
|
||||
cpu_micro_gemm::MicroGemm<cpu_utils::ISA::VEC, scalar_t>; \
|
||||
return __VA_ARGS__(); \
|
||||
} \
|
||||
default: { \
|
||||
TORCH_CHECK(false, "Invalid CPU ISA type."); \
|
||||
} \
|
||||
} \
|
||||
}()
|
||||
|
||||
namespace {
|
||||
enum class FusedMOEAct { SiluAndMul, SwigluOAIAndMul };
|
||||
|
||||
FusedMOEAct get_act_type(const std::string& act) {
|
||||
if (act == "silu") {
|
||||
return FusedMOEAct::SiluAndMul;
|
||||
} else if (act == "swigluoai") {
|
||||
return FusedMOEAct::SwigluOAIAndMul;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Invalid act type: " + act);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void swigluoai_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
|
||||
const int32_t m_size, const int32_t n_size,
|
||||
const int32_t input_stride,
|
||||
const int32_t output_stride) {
|
||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
||||
// For GPT-OSS interleaved gate-up weights
|
||||
alignas(64) static int32_t index[16] = {0, 2, 4, 6, 8, 10, 12, 14,
|
||||
16, 18, 20, 22, 24, 26, 28, 30};
|
||||
vec_op::INT32Vec16 index_vec(index);
|
||||
vec_op::FP32Vec16 gate_up_max_vec(7.0);
|
||||
vec_op::FP32Vec16 up_min_vec(-7.0);
|
||||
vec_op::FP32Vec16 alpha_vec(1.702);
|
||||
vec_op::FP32Vec16 one_vec(1.0);
|
||||
|
||||
DEFINE_FAST_EXP
|
||||
|
||||
for (int32_t m = 0; m < m_size; ++m) {
|
||||
for (int32_t n = 0; n < n_size; n += 32) {
|
||||
vec_op::FP32Vec16 gate_vec(input + n, index_vec);
|
||||
vec_op::FP32Vec16 up_vec(input + n + 1, index_vec);
|
||||
gate_vec = gate_vec.min(gate_up_max_vec);
|
||||
up_vec = up_vec.clamp(up_min_vec, gate_up_max_vec);
|
||||
auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec * alpha_vec));
|
||||
auto glu = gate_vec * sigmoid_vec;
|
||||
auto gated_output_fp32 = (one_vec + up_vec) * glu;
|
||||
scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32);
|
||||
gated_output.save(output + n / 2);
|
||||
}
|
||||
input += input_stride;
|
||||
output += output_stride;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void silu_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
|
||||
const int32_t m_size, const int32_t n_size,
|
||||
const int32_t input_stride, const int32_t output_stride) {
|
||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
||||
const int32_t dim = n_size / 2;
|
||||
float* __restrict__ gate = input;
|
||||
float* __restrict__ up = input + dim;
|
||||
vec_op::FP32Vec16 one_vec(1.0);
|
||||
|
||||
DEFINE_FAST_EXP
|
||||
|
||||
for (int32_t m = 0; m < m_size; ++m) {
|
||||
for (int32_t n = 0; n < dim; n += 16) {
|
||||
vec_op::FP32Vec16 gate_vec(gate + n);
|
||||
vec_op::FP32Vec16 up_vec(up + n);
|
||||
auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec));
|
||||
auto silu = gate_vec * sigmoid_vec;
|
||||
auto gated_output_fp32 = up_vec * silu;
|
||||
scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32);
|
||||
gated_output.save(output + n);
|
||||
}
|
||||
gate += input_stride;
|
||||
up += input_stride;
|
||||
output += output_stride;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
FORCE_INLINE void apply_gated_act(const FusedMOEAct act,
|
||||
float* __restrict__ input,
|
||||
scalar_t* __restrict__ output,
|
||||
const int32_t m, const int32_t n,
|
||||
const int32_t input_stride,
|
||||
const int32_t output_stride) {
|
||||
switch (act) {
|
||||
case FusedMOEAct::SwigluOAIAndMul:
|
||||
swigluoai_and_mul(input, output, m, n, input_stride, output_stride);
|
||||
return;
|
||||
case FusedMOEAct::SiluAndMul:
|
||||
silu_and_mul(input, output, m, n, input_stride, output_stride);
|
||||
return;
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported act type.");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename gemm_t>
|
||||
void prepack_moe_weight_impl(scalar_t* __restrict__ weight_ptr,
|
||||
scalar_t* __restrict__ packed_weight_ptr,
|
||||
const int32_t expert_num,
|
||||
const int32_t output_size,
|
||||
const int32_t input_size,
|
||||
const int64_t expert_stride) {
|
||||
#pragma omp parallel for
|
||||
for (int32_t e_idx = 0; e_idx < expert_num; ++e_idx) {
|
||||
gemm_t::pack_weight(weight_ptr + expert_stride * e_idx,
|
||||
packed_weight_ptr + expert_stride * e_idx, output_size,
|
||||
input_size);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename w_t, typename gemm_t>
|
||||
void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input,
|
||||
w_t* __restrict__ w13, w_t* __restrict__ w2,
|
||||
w_t* __restrict__ w13_bias, w_t* __restrict__ w2_bias,
|
||||
float* __restrict__ topk_weights,
|
||||
int32_t* __restrict__ topk_id, FusedMOEAct act_type,
|
||||
const int32_t token_num, const int32_t expert_num,
|
||||
const int32_t topk_num, const int32_t input_size_13,
|
||||
const int32_t output_size_13, const int32_t input_size_2,
|
||||
const int32_t output_size_2) {
|
||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
||||
constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
|
||||
constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
|
||||
constexpr int32_t min_w13_n_tile_size = 2 * gemm_n_tile_size;
|
||||
static_assert(gemm_n_tile_size % 16 == 0);
|
||||
|
||||
TORCH_CHECK_EQ(output_size_13 % min_w13_n_tile_size, 0);
|
||||
TORCH_CHECK_EQ(output_size_2 % gemm_n_tile_size, 0);
|
||||
TORCH_CHECK_EQ(output_size_13 / 2, input_size_2);
|
||||
|
||||
const int32_t thread_num = omp_get_max_threads();
|
||||
|
||||
const int32_t w13_input_buffer_size = cpu_utils::round_up<64>(
|
||||
gemm_m_tile_size * input_size_13 * sizeof(scalar_t));
|
||||
|
||||
const int32_t w13_n_tile_size = [&]() {
|
||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
||||
// input buffer + output buffer + weight
|
||||
const int32_t n_size_cache_limit =
|
||||
(cache_size - w13_input_buffer_size) /
|
||||
(gemm_m_tile_size * sizeof(float) + input_size_13 * sizeof(scalar_t));
|
||||
const int32_t n_size_thread_limit =
|
||||
output_size_13 / std::max(1, thread_num / topk_num);
|
||||
const int32_t n_size = cpu_utils::round_down<min_w13_n_tile_size>(
|
||||
std::min(n_size_cache_limit, n_size_thread_limit));
|
||||
return std::max(n_size, min_w13_n_tile_size);
|
||||
}();
|
||||
|
||||
const int32_t w2_input_tile_size = cpu_utils::round_up<64>(
|
||||
gemm_m_tile_size * input_size_2 * sizeof(scalar_t));
|
||||
|
||||
const int32_t w2_n_tile_size = [&]() {
|
||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
||||
// input tile + weight
|
||||
const int32_t n_size_cache_limit =
|
||||
(cache_size - w2_input_tile_size) / (input_size_2 * sizeof(scalar_t));
|
||||
const int32_t n_size_thread_limit =
|
||||
output_size_2 / std::max(1, thread_num / topk_num);
|
||||
const int32_t n_size = cpu_utils::round_down<gemm_n_tile_size>(
|
||||
std::min(n_size_cache_limit, n_size_thread_limit));
|
||||
return std::max(n_size, gemm_n_tile_size);
|
||||
}();
|
||||
|
||||
// allocate buffers
|
||||
int32_t common_buffer_offset = 0;
|
||||
int32_t w13_thread_buffer_offset = 0;
|
||||
int32_t ws_thread_buffer_offset = 0;
|
||||
|
||||
// common buffers
|
||||
const int32_t token_num_per_group_buffer_size =
|
||||
cpu_utils::round_up<64>(expert_num * sizeof(int32_t));
|
||||
const int32_t token_num_per_group_buffer_offset = common_buffer_offset;
|
||||
common_buffer_offset += token_num_per_group_buffer_size;
|
||||
|
||||
const int32_t cu_token_num_per_group_buffer_size =
|
||||
cpu_utils::round_up<64>((expert_num + 1) * sizeof(int32_t));
|
||||
const int32_t cu_token_num_per_group_buffer_offset = common_buffer_offset;
|
||||
common_buffer_offset += cu_token_num_per_group_buffer_size;
|
||||
|
||||
const int32_t expand_token_id_buffer_size =
|
||||
cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t));
|
||||
const int32_t expand_token_id_buffer_offset = common_buffer_offset;
|
||||
common_buffer_offset += expand_token_id_buffer_size;
|
||||
|
||||
const int32_t expand_token_id_index_buffer_size =
|
||||
cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t));
|
||||
const int32_t expand_token_id_index_buffer_offset = common_buffer_offset;
|
||||
common_buffer_offset += expand_token_id_index_buffer_size;
|
||||
|
||||
const int32_t w13_gemm_output_buffer_size = cpu_utils::round_up<64>(
|
||||
token_num * topk_num * (output_size_13 / 2) * sizeof(scalar_t));
|
||||
const int32_t w13_gemm_output_buffer_offset = common_buffer_offset;
|
||||
common_buffer_offset += w13_gemm_output_buffer_size;
|
||||
|
||||
const int32_t w2_gemm_output_buffer_size = cpu_utils::round_up<64>(
|
||||
token_num * topk_num * output_size_2 * sizeof(float));
|
||||
const int32_t w2_gemm_output_buffer_offset = common_buffer_offset;
|
||||
common_buffer_offset += w2_gemm_output_buffer_size;
|
||||
|
||||
// w13 GEMM thread buffers
|
||||
const int32_t w13_input_buffer_offset = w13_thread_buffer_offset;
|
||||
w13_thread_buffer_offset += w13_input_buffer_size;
|
||||
|
||||
const int32_t w13_output_buffer_size = cpu_utils::round_up<64>(
|
||||
gemm_m_tile_size * w13_n_tile_size * sizeof(float));
|
||||
const int32_t w13_output_buffer_offset = w13_thread_buffer_offset;
|
||||
w13_thread_buffer_offset += w13_output_buffer_size;
|
||||
|
||||
// Weighted sum thread buffer
|
||||
const int32_t ws_output_buffer_size =
|
||||
cpu_utils::round_up<64>(output_size_2 * sizeof(float));
|
||||
const int32_t ws_output_buffer_offset = ws_thread_buffer_offset;
|
||||
ws_thread_buffer_offset += ws_output_buffer_size;
|
||||
|
||||
const int32_t buffer_size =
|
||||
common_buffer_offset +
|
||||
std::max(w13_thread_buffer_offset, ws_thread_buffer_offset) * thread_num;
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size);
|
||||
uint8_t* common_buffer_start =
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()
|
||||
->get_data<uint8_t>();
|
||||
uint8_t* thread_buffer_start = common_buffer_start + common_buffer_offset;
|
||||
|
||||
int32_t* __restrict__ token_num_per_group_buffer = reinterpret_cast<int32_t*>(
|
||||
common_buffer_start + token_num_per_group_buffer_offset);
|
||||
int32_t* __restrict__ cu_token_num_per_group_buffer =
|
||||
reinterpret_cast<int32_t*>(common_buffer_start +
|
||||
cu_token_num_per_group_buffer_offset);
|
||||
int32_t* __restrict__ expand_token_id_buffer = reinterpret_cast<int32_t*>(
|
||||
common_buffer_start + expand_token_id_buffer_offset);
|
||||
int32_t* __restrict__ expand_token_id_index_buffer =
|
||||
reinterpret_cast<int32_t*>(common_buffer_start +
|
||||
expand_token_id_index_buffer_offset);
|
||||
|
||||
// prepare token-expert mappings
|
||||
{
|
||||
std::memset(token_num_per_group_buffer, 0, expert_num * sizeof(int32_t));
|
||||
for (int32_t i = 0; i < token_num * topk_num; ++i) {
|
||||
int32_t curr_expert_id = topk_id[i];
|
||||
++token_num_per_group_buffer[curr_expert_id];
|
||||
}
|
||||
|
||||
int32_t token_num_sum = 0;
|
||||
cu_token_num_per_group_buffer[0] = 0;
|
||||
int32_t* token_index_buffer = cu_token_num_per_group_buffer + 1;
|
||||
for (int32_t i = 0; i < expert_num; ++i) {
|
||||
token_index_buffer[i] = token_num_sum;
|
||||
token_num_sum += token_num_per_group_buffer[i];
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < token_num; ++i) {
|
||||
int32_t* curr_topk_id = topk_id + i * topk_num;
|
||||
int32_t* curr_index_buffer = expand_token_id_index_buffer + i * topk_num;
|
||||
for (int32_t j = 0; j < topk_num; ++j) {
|
||||
int32_t curr_expert_id = curr_topk_id[j];
|
||||
int32_t curr_index = token_index_buffer[curr_expert_id];
|
||||
++token_index_buffer[curr_expert_id];
|
||||
expand_token_id_buffer[curr_index] = i;
|
||||
curr_index_buffer[j] = curr_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// w13 GEMM + act
|
||||
{
|
||||
alignas(64) cpu_utils::Counter counter;
|
||||
cpu_utils::Counter* counter_ptr = &counter;
|
||||
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
|
||||
const int32_t task_num_per_expert =
|
||||
(output_size_13 + w13_n_tile_size - 1) / w13_n_tile_size;
|
||||
const int32_t task_num = task_num_per_expert * expert_num;
|
||||
|
||||
uint8_t* __restrict__ thread_buffer =
|
||||
thread_buffer_start + thread_id * w13_thread_buffer_offset;
|
||||
scalar_t* __restrict__ w13_input_buffer =
|
||||
reinterpret_cast<scalar_t*>(thread_buffer + w13_input_buffer_offset);
|
||||
float* __restrict__ w13_output_buffer =
|
||||
reinterpret_cast<float*>(thread_buffer + w13_output_buffer_offset);
|
||||
scalar_t* __restrict__ w13_gemm_output_buffer =
|
||||
reinterpret_cast<scalar_t*>(common_buffer_start +
|
||||
w13_gemm_output_buffer_offset);
|
||||
|
||||
gemm_t gemm;
|
||||
|
||||
const int32_t input_size_13_bytes = input_size_13 * sizeof(scalar_t);
|
||||
const int32_t w13_n_group_stride = 16 * input_size_13;
|
||||
const int32_t w13_n_tile_stride = gemm_n_tile_size * input_size_13;
|
||||
|
||||
for (;;) {
|
||||
int32_t task_id = counter_ptr->acquire_counter();
|
||||
if (task_id >= task_num) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int32_t curr_expert_id = task_id / task_num_per_expert;
|
||||
const int32_t curr_output_group_id = task_id % task_num_per_expert;
|
||||
const int32_t curr_token_num =
|
||||
token_num_per_group_buffer[curr_expert_id];
|
||||
if (curr_token_num == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int32_t actual_n_tile_size =
|
||||
std::min(w13_n_tile_size,
|
||||
output_size_13 - curr_output_group_id * w13_n_tile_size);
|
||||
const int32_t* __restrict__ curr_expand_token_id_buffer =
|
||||
expand_token_id_buffer +
|
||||
cu_token_num_per_group_buffer[curr_expert_id];
|
||||
scalar_t* __restrict__ curr_w13_gemm_output_buffer =
|
||||
w13_gemm_output_buffer +
|
||||
cu_token_num_per_group_buffer[curr_expert_id] *
|
||||
(output_size_13 / 2) +
|
||||
curr_output_group_id * w13_n_tile_size / 2;
|
||||
|
||||
w_t* __restrict__ w13_weight_ptr_0 = nullptr;
|
||||
w_t* __restrict__ w13_weight_ptr_1 = nullptr;
|
||||
w_t* __restrict__ w13_bias_ptr_0 = nullptr;
|
||||
w_t* __restrict__ w13_bias_ptr_1 = nullptr;
|
||||
if (act_type == FusedMOEAct::SwigluOAIAndMul) {
|
||||
// For SwigluOAIAndMul, up and down weights are interleaved
|
||||
w13_weight_ptr_0 =
|
||||
w13 + curr_expert_id * input_size_13 * output_size_13 +
|
||||
curr_output_group_id * w13_n_tile_size * input_size_13;
|
||||
w13_weight_ptr_1 =
|
||||
w13_weight_ptr_0 + actual_n_tile_size / 2 * input_size_13;
|
||||
if (w13_bias != nullptr) {
|
||||
w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 +
|
||||
curr_output_group_id * w13_n_tile_size;
|
||||
w13_bias_ptr_1 = w13_bias_ptr_0 + actual_n_tile_size / 2;
|
||||
}
|
||||
} else {
|
||||
w13_weight_ptr_0 =
|
||||
w13 + curr_expert_id * input_size_13 * output_size_13 +
|
||||
curr_output_group_id * (w13_n_tile_size / 2) * input_size_13;
|
||||
w13_weight_ptr_1 =
|
||||
w13_weight_ptr_0 + output_size_13 / 2 * input_size_13;
|
||||
if (w13_bias != nullptr) {
|
||||
w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 +
|
||||
curr_output_group_id * (w13_n_tile_size / 2);
|
||||
w13_bias_ptr_1 = w13_bias_ptr_0 + output_size_13 / 2;
|
||||
}
|
||||
}
|
||||
|
||||
scalar_t* __restrict__ curr_w13_input_buffer = w13_input_buffer;
|
||||
for (int32_t token_idx = 0; token_idx < curr_token_num;
|
||||
token_idx += gemm_m_tile_size) {
|
||||
const int32_t actual_token_num =
|
||||
std::min(gemm_m_tile_size, curr_token_num - token_idx);
|
||||
// copy inputs
|
||||
{
|
||||
scalar_t* __restrict__ curr_w13_input_buffer_iter =
|
||||
curr_w13_input_buffer;
|
||||
for (int32_t i = 0; i < actual_token_num; ++i) {
|
||||
const int32_t curr_token_id = curr_expand_token_id_buffer[i];
|
||||
int8_t* __restrict__ curr_input_iter = reinterpret_cast<int8_t*>(
|
||||
input + curr_token_id * input_size_13);
|
||||
int8_t* __restrict__ curr_output_iter =
|
||||
reinterpret_cast<int8_t*>(curr_w13_input_buffer_iter);
|
||||
int32_t j = 0;
|
||||
for (; j < input_size_13_bytes - 64; j += 64) {
|
||||
vec_op::INT8Vec64 vec(curr_input_iter);
|
||||
vec.save(curr_output_iter);
|
||||
curr_input_iter += 64;
|
||||
curr_output_iter += 64;
|
||||
}
|
||||
vec_op::INT8Vec64 vec(curr_input_iter);
|
||||
vec.save(curr_output_iter, input_size_13_bytes - j);
|
||||
|
||||
// update
|
||||
curr_w13_input_buffer_iter += input_size_13;
|
||||
}
|
||||
// update
|
||||
curr_expand_token_id_buffer += actual_token_num;
|
||||
}
|
||||
|
||||
// gemm + act
|
||||
{
|
||||
scalar_t* __restrict__ w13_weight_ptr_0_iter = w13_weight_ptr_0;
|
||||
scalar_t* __restrict__ w13_weight_ptr_1_iter = w13_weight_ptr_1;
|
||||
scalar_t* __restrict__ w13_bias_ptr_0_iter = w13_bias_ptr_0;
|
||||
scalar_t* __restrict__ w13_bias_ptr_1_iter = w13_bias_ptr_1;
|
||||
scalar_t* __restrict__ curr_w13_input_buffer_iter =
|
||||
curr_w13_input_buffer;
|
||||
float* __restrict__ w13_output_buffer_0_iter = w13_output_buffer;
|
||||
float* __restrict__ w13_output_buffer_1_iter =
|
||||
w13_output_buffer + actual_n_tile_size / 2;
|
||||
for (int32_t i = 0; i < actual_n_tile_size;
|
||||
i += min_w13_n_tile_size) {
|
||||
gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_0_iter,
|
||||
w13_output_buffer_0_iter, actual_token_num,
|
||||
input_size_13, input_size_13, w13_n_group_stride,
|
||||
actual_n_tile_size, false);
|
||||
|
||||
if (w13_bias != nullptr) {
|
||||
cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
|
||||
w13_output_buffer_0_iter, w13_output_buffer_0_iter,
|
||||
w13_bias_ptr_0_iter, actual_token_num, actual_n_tile_size,
|
||||
actual_n_tile_size);
|
||||
w13_bias_ptr_0_iter += gemm_n_tile_size;
|
||||
}
|
||||
|
||||
gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_1_iter,
|
||||
w13_output_buffer_1_iter, actual_token_num,
|
||||
input_size_13, input_size_13, w13_n_group_stride,
|
||||
actual_n_tile_size, false);
|
||||
|
||||
if (w13_bias != nullptr) {
|
||||
cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
|
||||
w13_output_buffer_1_iter, w13_output_buffer_1_iter,
|
||||
w13_bias_ptr_1_iter, actual_token_num, actual_n_tile_size,
|
||||
actual_n_tile_size);
|
||||
w13_bias_ptr_1_iter += gemm_n_tile_size;
|
||||
}
|
||||
|
||||
// update
|
||||
w13_weight_ptr_0_iter += w13_n_tile_stride;
|
||||
w13_weight_ptr_1_iter += w13_n_tile_stride;
|
||||
w13_output_buffer_0_iter += gemm_n_tile_size;
|
||||
w13_output_buffer_1_iter += gemm_n_tile_size;
|
||||
}
|
||||
|
||||
apply_gated_act(act_type, w13_output_buffer,
|
||||
curr_w13_gemm_output_buffer, actual_token_num,
|
||||
actual_n_tile_size, actual_n_tile_size,
|
||||
output_size_13 / 2);
|
||||
|
||||
// update
|
||||
curr_w13_gemm_output_buffer +=
|
||||
gemm_m_tile_size * (output_size_13 / 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// w2 GEMM
|
||||
{
|
||||
alignas(64) cpu_utils::Counter counter;
|
||||
cpu_utils::Counter* counter_ptr = &counter;
|
||||
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
|
||||
const int32_t task_num_per_expert =
|
||||
(output_size_2 + w2_n_tile_size - 1) / w2_n_tile_size;
|
||||
const int32_t task_num = task_num_per_expert * expert_num;
|
||||
scalar_t* __restrict__ w13_gemm_output_buffer =
|
||||
reinterpret_cast<scalar_t*>(common_buffer_start +
|
||||
w13_gemm_output_buffer_offset);
|
||||
float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
|
||||
common_buffer_start + w2_gemm_output_buffer_offset);
|
||||
|
||||
gemm_t gemm;
|
||||
|
||||
const int32_t w2_n_tile_stride = gemm_n_tile_size * input_size_2;
|
||||
const int32_t w2_n_group_stride = 16 * input_size_2;
|
||||
|
||||
for (;;) {
|
||||
int32_t task_id = counter_ptr->acquire_counter();
|
||||
if (task_id >= task_num) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int32_t curr_expert_id = task_id / task_num_per_expert;
|
||||
const int32_t curr_output_group_id = task_id % task_num_per_expert;
|
||||
const int32_t curr_token_num =
|
||||
token_num_per_group_buffer[curr_expert_id];
|
||||
if (curr_token_num == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int32_t actual_n_tile_size =
|
||||
std::min(w2_n_tile_size,
|
||||
output_size_2 - curr_output_group_id * w2_n_tile_size);
|
||||
scalar_t* __restrict__ curr_w13_gemm_output_buffer =
|
||||
w13_gemm_output_buffer +
|
||||
cu_token_num_per_group_buffer[curr_expert_id] * input_size_2;
|
||||
float* __restrict__ curr_w2_gemm_output_buffer =
|
||||
w2_gemm_output_buffer +
|
||||
cu_token_num_per_group_buffer[curr_expert_id] * output_size_2 +
|
||||
curr_output_group_id * w2_n_tile_size;
|
||||
scalar_t* __restrict__ w2_weight_ptr =
|
||||
w2 + curr_expert_id * output_size_2 * input_size_2 +
|
||||
curr_output_group_id * w2_n_tile_size * input_size_2;
|
||||
scalar_t* __restrict__ w2_bias_ptr = nullptr;
|
||||
if (w2_bias != nullptr) {
|
||||
w2_bias_ptr = w2_bias + curr_expert_id * output_size_2 +
|
||||
curr_output_group_id * w2_n_tile_size;
|
||||
}
|
||||
|
||||
for (int32_t token_idx = 0; token_idx < curr_token_num;
|
||||
token_idx += gemm_m_tile_size) {
|
||||
const int32_t actual_token_num =
|
||||
std::min(gemm_m_tile_size, curr_token_num - token_idx);
|
||||
|
||||
scalar_t* __restrict__ w2_weight_ptr_iter = w2_weight_ptr;
|
||||
scalar_t* __restrict__ w2_bias_ptr_iter = w2_bias_ptr;
|
||||
float* __restrict__ curr_w2_gemm_output_buffer_iter =
|
||||
curr_w2_gemm_output_buffer;
|
||||
for (int32_t i = 0; i < actual_n_tile_size; i += gemm_n_tile_size) {
|
||||
gemm.gemm(curr_w13_gemm_output_buffer, w2_weight_ptr_iter,
|
||||
curr_w2_gemm_output_buffer_iter, actual_token_num,
|
||||
input_size_2, input_size_2, w2_n_group_stride,
|
||||
output_size_2, false);
|
||||
|
||||
if (w2_bias != nullptr) {
|
||||
cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
|
||||
curr_w2_gemm_output_buffer_iter,
|
||||
curr_w2_gemm_output_buffer_iter, w2_bias_ptr_iter,
|
||||
actual_token_num, output_size_2, output_size_2);
|
||||
w2_bias_ptr_iter += gemm_n_tile_size;
|
||||
}
|
||||
|
||||
w2_weight_ptr_iter += w2_n_tile_stride;
|
||||
curr_w2_gemm_output_buffer_iter += gemm_n_tile_size;
|
||||
}
|
||||
|
||||
// update
|
||||
curr_w13_gemm_output_buffer += gemm_m_tile_size * input_size_2;
|
||||
curr_w2_gemm_output_buffer += gemm_m_tile_size * output_size_2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// weighted sum
|
||||
{
|
||||
alignas(64) cpu_utils::Counter counter;
|
||||
cpu_utils::Counter* counter_ptr = &counter;
|
||||
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
|
||||
const int32_t task_num = token_num;
|
||||
uint8_t* __restrict__ thread_buffer =
|
||||
thread_buffer_start + thread_id * ws_thread_buffer_offset;
|
||||
float* __restrict__ ws_output_buffer =
|
||||
reinterpret_cast<float*>(thread_buffer + ws_output_buffer_offset);
|
||||
float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
|
||||
common_buffer_start + w2_gemm_output_buffer_offset);
|
||||
|
||||
for (;;) {
|
||||
int32_t task_id = counter_ptr->acquire_counter();
|
||||
if (task_id >= task_num) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t token_id = task_id;
|
||||
int32_t* __restrict__ curr_expand_token_id_index_buffer =
|
||||
expand_token_id_index_buffer + token_id * topk_num;
|
||||
float* __restrict__ curr_weight = topk_weights + token_id * topk_num;
|
||||
scalar_t* __restrict__ curr_output_buffer =
|
||||
output + token_id * output_size_2;
|
||||
|
||||
if (topk_num > 1) {
|
||||
{
|
||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
|
||||
float* __restrict__ w2_output_iter =
|
||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
||||
float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
|
||||
vec_op::FP32Vec16 weight_vec(curr_weight[0]);
|
||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
||||
vec = vec * weight_vec;
|
||||
vec.save(ws_output_buffer_iter);
|
||||
|
||||
// update
|
||||
w2_output_iter += 16;
|
||||
ws_output_buffer_iter += 16;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
for (int32_t idx = 1; idx < topk_num - 1; ++idx) {
|
||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx];
|
||||
float* __restrict__ w2_output_iter =
|
||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
||||
float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
|
||||
vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
|
||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
||||
vec_op::FP32Vec16 sum(ws_output_buffer_iter);
|
||||
sum = sum + vec * weight_vec;
|
||||
sum.save(ws_output_buffer_iter);
|
||||
|
||||
// update
|
||||
w2_output_iter += 16;
|
||||
ws_output_buffer_iter += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int32_t idx = topk_num - 1;
|
||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx];
|
||||
float* __restrict__ w2_output_iter =
|
||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
||||
float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
|
||||
scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
|
||||
vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
|
||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
||||
vec_op::FP32Vec16 sum(ws_output_buffer_iter);
|
||||
sum = sum + vec * weight_vec;
|
||||
scalar_vec_t out_vec(sum);
|
||||
out_vec.save(curr_output_buffer_iter);
|
||||
|
||||
// update
|
||||
w2_output_iter += 16;
|
||||
ws_output_buffer_iter += 16;
|
||||
curr_output_buffer_iter += 16;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
|
||||
float* __restrict__ w2_output_iter =
|
||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
||||
scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
|
||||
vec_op::FP32Vec16 weight_vec(curr_weight[0]);
|
||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
||||
vec = vec * weight_vec;
|
||||
scalar_vec_t out_vec(vec);
|
||||
out_vec.save(curr_output_buffer_iter);
|
||||
|
||||
// update
|
||||
w2_output_iter += 16;
|
||||
curr_output_buffer_iter += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void prepack_moe_weight(
|
||||
const torch::Tensor& weight, // [expert_num, output_size, input_size]
|
||||
torch::Tensor& packed_weight, const std::string& isa) {
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
const int32_t expert_num = weight.size(0);
|
||||
const int32_t output_size = weight.size(1);
|
||||
const int32_t input_size = weight.size(2);
|
||||
TORCH_CHECK_EQ(output_size % 32, 0);
|
||||
const int64_t expert_stride = weight.stride(0);
|
||||
cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
weight.scalar_type(), "prepack_moe_weight", [&]() {
|
||||
CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
|
||||
scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
|
||||
scalar_t* packed_weight_ptr = packed_weight.data_ptr<scalar_t>();
|
||||
prepack_moe_weight_impl<scalar_t, gemm_t>(
|
||||
weight_ptr, packed_weight_ptr, expert_num, output_size,
|
||||
input_size, expert_stride);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
void cpu_fused_moe(
|
||||
torch::Tensor& output, // [token_num, output_size_2]
|
||||
const torch::Tensor& input, // [token_num, input_size_13]
|
||||
const torch::Tensor&
|
||||
w13, // [expert_num, output_size_13, input_size_13], packed
|
||||
const torch::Tensor&
|
||||
w2, // [expert_num, output_size_2, input_size_2], packed
|
||||
const std::optional<torch::Tensor>&
|
||||
w13_bias, // [expert_num, output_size_13]
|
||||
const std::optional<torch::Tensor>& w2_bias, // [expert_num, output_size_2]
|
||||
const torch::Tensor& topk_weights, // [token_num, k], float32
|
||||
const torch::Tensor& topk_id, // [token_num, k], int32
|
||||
const std::string& act, const std::string& isa) {
|
||||
const int32_t token_num = input.size(0);
|
||||
const int32_t input_size_13 = input.size(1);
|
||||
const int64_t input_stride = input.stride(0);
|
||||
TORCH_CHECK_EQ(input_stride, input_size_13);
|
||||
const int32_t expert_num = w13.size(0);
|
||||
const int32_t output_size_13 = w13.size(1);
|
||||
const int32_t input_size_2 = w2.size(2);
|
||||
const int32_t output_size_2 = w2.size(1);
|
||||
const int32_t topk_num = topk_id.size(1);
|
||||
const FusedMOEAct act_type = get_act_type(act);
|
||||
cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() {
|
||||
CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
|
||||
fused_moe_impl<scalar_t, scalar_t, gemm_t>(
|
||||
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
|
||||
w13.data_ptr<scalar_t>(), w2.data_ptr<scalar_t>(),
|
||||
w13_bias.has_value() ? w13_bias->data_ptr<scalar_t>() : nullptr,
|
||||
w2_bias.has_value() ? w2_bias->data_ptr<scalar_t>() : nullptr,
|
||||
topk_weights.data_ptr<float>(), topk_id.data_ptr<int32_t>(), act_type,
|
||||
token_num, expert_num, topk_num, input_size_13, output_size_13,
|
||||
input_size_2, output_size_2);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -352,10 +352,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
explicit FP32Vec16(bool, void* ptr)
|
||||
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
|
||||
|
||||
// strided load
|
||||
explicit FP32Vec16(const float* ptr, INT32Vec16 idx)
|
||||
: reg(_mm512_i32gather_ps(idx.reg, ptr, 4)) {}
|
||||
|
||||
explicit FP32Vec16(__m512 data) : reg(data) {}
|
||||
|
||||
// de-pack 4 bit values
|
||||
@@ -412,10 +408,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
return FP32Vec16(_mm512_sub_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec16 operator-() const {
|
||||
return FP32Vec16(_mm512_xor_ps(reg, _mm512_set1_ps(-0.0f)));
|
||||
}
|
||||
|
||||
FP32Vec16 operator/(const FP32Vec16& b) const {
|
||||
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "cpu/cpu_types.hpp"
|
||||
#include "cpu/utils.hpp"
|
||||
#include "cpu_types.hpp"
|
||||
#include "scratchpad_manager.h"
|
||||
#include "utils.hpp"
|
||||
|
||||
#ifdef CPU_CAPABILITY_AMXBF16
|
||||
#include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
|
||||
@@ -157,7 +158,7 @@ void cpu_gemm_wna16_impl(
|
||||
// a simple schedule policy, just to hold more B tiles in L2 and make sure
|
||||
// each thread has tasks
|
||||
const int32_t n_partition_size = [&]() {
|
||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
||||
const int64_t cache_size = cpu_utils::get_l2_size();
|
||||
int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
|
||||
int64_t ps_thread_limit = n_size / thread_num;
|
||||
ps_cache_limit =
|
||||
@@ -178,8 +179,8 @@ void cpu_gemm_wna16_impl(
|
||||
const int64_t b_buffer_offset = 0;
|
||||
const int64_t c_buffer_offset = b_buffer_size;
|
||||
const int64_t buffer_size = b_buffer_size + c_buffer_size;
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size *
|
||||
thread_num);
|
||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size *
|
||||
thread_num);
|
||||
|
||||
alignas(64) cpu_utils::Counter counter;
|
||||
cpu_utils::Counter* counter_ptr = &counter;
|
||||
@@ -189,10 +190,9 @@ void cpu_gemm_wna16_impl(
|
||||
scalar_t* __restrict__ b_buffer = nullptr;
|
||||
float* __restrict__ c_buffer = nullptr;
|
||||
{
|
||||
uint8_t* buffer_ptr =
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()
|
||||
->get_data<uint8_t>() +
|
||||
thread_id * buffer_size;
|
||||
uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager()
|
||||
->get_data<uint8_t>() +
|
||||
thread_id * buffer_size;
|
||||
b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
|
||||
c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
|
||||
}
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
#include "common/memory_desc.hpp"
|
||||
#include "common/memory.hpp"
|
||||
|
||||
#include "cpu/utils.hpp"
|
||||
#include "cpu/dnnl_helper.h"
|
||||
#include "dnnl_helper.h"
|
||||
#include "scratchpad_manager.h"
|
||||
|
||||
static dnnl::engine& default_engine() {
|
||||
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
|
||||
@@ -274,7 +274,7 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
|
||||
|
||||
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
|
||||
scratchpad_storage->set_data_handle(
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<void>());
|
||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
||||
|
||||
matmul.execute(default_stream(), memory_cache_);
|
||||
default_stream().wait();
|
||||
@@ -294,7 +294,7 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
|
||||
|
||||
return m_size_cache_->get_or_create(key, [&]() {
|
||||
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
|
||||
auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager();
|
||||
auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
|
||||
manager->realloc(desc.scratchpad_desc().get_size());
|
||||
return dnnl::matmul(desc);
|
||||
});
|
||||
@@ -470,7 +470,7 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
|
||||
|
||||
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
|
||||
scratchpad_storage->set_data_handle(
|
||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<void>());
|
||||
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
||||
|
||||
matmul.execute(default_stream(), memory_cache_);
|
||||
default_stream().wait();
|
||||
@@ -486,7 +486,7 @@ dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
|
||||
}
|
||||
return m_size_cache_->get_or_create(key, [&]() {
|
||||
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
|
||||
auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager();
|
||||
auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
|
||||
manager->realloc(desc.scratchpad_desc().get_size());
|
||||
return dnnl::matmul(desc);
|
||||
});
|
||||
|
||||
@@ -235,39 +235,6 @@ class MicroGemm<cpu_utils::ISA::AMX, scalar_t> {
|
||||
}
|
||||
}
|
||||
|
||||
static void pack_weight(const scalar_t* __restrict__ weight,
|
||||
scalar_t* __restrict__ packed_weight,
|
||||
const int32_t output_size, const int32_t input_size) {
|
||||
constexpr int32_t elem_num_per_group = 4 / sizeof(scalar_t);
|
||||
TORCH_CHECK_EQ(output_size % 16, 0);
|
||||
TORCH_CHECK_EQ(input_size % (16 * elem_num_per_group), 0);
|
||||
|
||||
const int32_t output_group_num = output_size / 16;
|
||||
const int32_t input_32b_num = input_size / elem_num_per_group;
|
||||
for (int32_t output_group_idx = 0; output_group_idx < output_group_num;
|
||||
++output_group_idx) {
|
||||
const int32_t* __restrict__ weight_32b =
|
||||
reinterpret_cast<const int32_t*>(weight);
|
||||
int32_t* __restrict__ packed_weight_32b =
|
||||
reinterpret_cast<int32_t*>(packed_weight);
|
||||
for (int32_t output_idx = 0; output_idx < 16; ++output_idx) {
|
||||
for (int32_t weight_offset = 0, packed_offset = 0;
|
||||
weight_offset < input_32b_num;
|
||||
++weight_offset, packed_offset += 16) {
|
||||
packed_weight_32b[packed_offset] = weight_32b[weight_offset];
|
||||
}
|
||||
|
||||
// update
|
||||
weight_32b += input_32b_num;
|
||||
packed_weight_32b += 1;
|
||||
}
|
||||
|
||||
// update
|
||||
weight += 16 * input_size;
|
||||
packed_weight += 16 * input_size;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
alignas(64) __tilecfg amx_tile_config_;
|
||||
int32_t curr_m_;
|
||||
|
||||
@@ -13,9 +13,6 @@ namespace cpu_micro_gemm {
|
||||
#define CPU_MICRO_GEMM_PARAMS \
|
||||
a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c
|
||||
|
||||
// Note: weights for MicroGemm should be packed as (output_size / 16) contiguous
|
||||
// blocks, means the logical shape of blocks is [16, input_size]. And the actual
|
||||
// layout of blocks can be ISA-specific.
|
||||
template <cpu_utils::ISA isa, typename scalar_t>
|
||||
class MicroGemm {
|
||||
public:
|
||||
@@ -89,41 +86,6 @@ FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr,
|
||||
curr_d += ldd;
|
||||
}
|
||||
}
|
||||
|
||||
template <int32_t n_size, typename scalar_t>
|
||||
FORCE_INLINE void add_bias_epilogue(float* c_ptr, float* d_ptr,
|
||||
scalar_t* __restrict__ bias_ptr,
|
||||
const int32_t m, const int64_t ldc,
|
||||
const int64_t ldd) {
|
||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
||||
static_assert(n_size % 16 == 0);
|
||||
constexpr int32_t n_group_num = n_size / 16;
|
||||
static_assert(n_group_num <= 16);
|
||||
|
||||
vec_op::FP32Vec16 bias_vecs[n_group_num];
|
||||
scalar_t* __restrict__ curr_bias = bias_ptr;
|
||||
vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t i) {
|
||||
scalar_vec_t vec(curr_bias);
|
||||
bias_vecs[i] = vec_op::FP32Vec16(vec);
|
||||
curr_bias += 16;
|
||||
});
|
||||
|
||||
float* curr_c = c_ptr;
|
||||
float* curr_d = d_ptr;
|
||||
for (int32_t i = 0; i < m; ++i) {
|
||||
float* curr_c_iter = curr_c;
|
||||
float* curr_d_iter = curr_d;
|
||||
vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t n_g_idx) {
|
||||
vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
|
||||
c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx];
|
||||
c_vec_fp32.save(curr_d_iter);
|
||||
curr_c_iter += 16;
|
||||
curr_d_iter += 16;
|
||||
});
|
||||
curr_c += ldc;
|
||||
curr_d += ldd;
|
||||
}
|
||||
}
|
||||
} // namespace cpu_micro_gemm
|
||||
|
||||
#endif
|
||||
|
||||
@@ -109,25 +109,6 @@ class MicroGemm<cpu_utils::ISA::VEC, scalar_t> {
|
||||
void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
TileGemm82<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
|
||||
}
|
||||
|
||||
// Note: pack contiguous weight [output_size, input_size] as contiguous
|
||||
// packed weight [output_size / 16, input_size, 16]
|
||||
static void pack_weight(const scalar_t* __restrict__ weight,
|
||||
scalar_t* __restrict__ packed_weight,
|
||||
const int32_t output_size, const int32_t input_size) {
|
||||
TORCH_CHECK_EQ(output_size % 16, 0);
|
||||
for (int32_t o_idx = 0; o_idx < output_size; ++o_idx) {
|
||||
const scalar_t* __restrict__ curr_weight = weight + o_idx * input_size;
|
||||
scalar_t* __restrict__ curr_packed_weight =
|
||||
packed_weight + (o_idx / 16) * (16 * input_size) + o_idx % 16;
|
||||
for (int32_t i_idx = 0; i_idx < input_size; ++i_idx) {
|
||||
*curr_packed_weight = *curr_weight;
|
||||
|
||||
curr_packed_weight += 16;
|
||||
++curr_weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace cpu_micro_gemm
|
||||
|
||||
|
||||
23
csrc/cpu/scratchpad_manager.cpp
Normal file
23
csrc/cpu/scratchpad_manager.cpp
Normal file
@@ -0,0 +1,23 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include "scratchpad_manager.h"
|
||||
|
||||
DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
|
||||
this->realloc(allocation_unit * 128);
|
||||
}
|
||||
|
||||
void DNNLScratchPadManager::realloc(size_t new_size) {
|
||||
new_size = round(new_size);
|
||||
if (new_size > size_) {
|
||||
if (ptr_ != nullptr) {
|
||||
std::free(ptr_);
|
||||
}
|
||||
ptr_ = std::aligned_alloc(64, new_size);
|
||||
size_ = new_size;
|
||||
}
|
||||
}
|
||||
|
||||
DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
|
||||
static DNNLScratchPadManager manager;
|
||||
return &manager;
|
||||
}
|
||||
31
csrc/cpu/scratchpad_manager.h
Normal file
31
csrc/cpu/scratchpad_manager.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#ifndef SCRATCHPAD_MANAGER_H
|
||||
#define SCRATCHPAD_MANAGER_H
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
|
||||
class DNNLScratchPadManager {
|
||||
public:
|
||||
static constexpr size_t allocation_unit = 4 * 1024; // 4KB
|
||||
|
||||
static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
|
||||
|
||||
DNNLScratchPadManager();
|
||||
|
||||
template <typename T>
|
||||
T* get_data() {
|
||||
return reinterpret_cast<T*>(ptr_);
|
||||
}
|
||||
|
||||
static size_t round(size_t size) {
|
||||
return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
|
||||
}
|
||||
|
||||
void realloc(size_t new_size);
|
||||
|
||||
private:
|
||||
size_t size_;
|
||||
void* ptr_;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -110,17 +110,6 @@ void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight,
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
const int64_t pack_factor, const std::string& isa_hint);
|
||||
|
||||
void prepack_moe_weight(const torch::Tensor& weight,
|
||||
torch::Tensor& packed_weight, const std::string& isa);
|
||||
|
||||
void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input,
|
||||
const torch::Tensor& w13, const torch::Tensor& w2,
|
||||
const std::optional<torch::Tensor>& w13_bias,
|
||||
const std::optional<torch::Tensor>& w2_bias,
|
||||
const torch::Tensor& topk_weights,
|
||||
const torch::Tensor& topk_id, const std::string& act,
|
||||
const std::string& isa);
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
||||
@@ -307,19 +296,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"pack_factor, str isa_hint) -> ()");
|
||||
ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16);
|
||||
#endif
|
||||
|
||||
// fused moe
|
||||
#if defined(__AVX512F__)
|
||||
ops.def(
|
||||
"prepack_moe_weight(Tensor weight, Tensor(a1!) packed_weight, str isa) "
|
||||
"-> ()");
|
||||
ops.impl("prepack_moe_weight", torch::kCPU, &prepack_moe_weight);
|
||||
ops.def(
|
||||
"cpu_fused_moe(Tensor(a0!) output, Tensor input, Tensor w13, Tensor w2, "
|
||||
"Tensor? w13_bias, Tensor? w2_bias, Tensor topk_weights, Tensor topk_id, "
|
||||
"str act, str isa) -> ()");
|
||||
ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
|
||||
#endif
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#define gettid() syscall(SYS_gettid)
|
||||
#endif
|
||||
|
||||
#include "cpu/utils.hpp"
|
||||
#include "cpu_types.hpp"
|
||||
|
||||
#ifdef VLLM_NUMA_DISABLED
|
||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
@@ -138,26 +138,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
#endif // VLLM_NUMA_DISABLED
|
||||
|
||||
namespace cpu_utils {
|
||||
ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
|
||||
this->realloc(allocation_unit * 128);
|
||||
}
|
||||
|
||||
void ScratchPadManager::realloc(size_t new_size) {
|
||||
new_size = round(new_size);
|
||||
if (new_size > size_) {
|
||||
if (ptr_ != nullptr) {
|
||||
std::free(ptr_);
|
||||
}
|
||||
ptr_ = std::aligned_alloc(64, new_size);
|
||||
size_ = new_size;
|
||||
}
|
||||
}
|
||||
|
||||
ScratchPadManager* ScratchPadManager::get_scratchpad_manager() {
|
||||
static ScratchPadManager manager;
|
||||
return &manager;
|
||||
}
|
||||
} // namespace cpu_utils
|
||||
#endif
|
||||
|
||||
@@ -2,24 +2,19 @@
|
||||
#define UTILS_HPP
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <unistd.h>
|
||||
#include <ATen/cpu/Utils.h>
|
||||
|
||||
#include "cpu/cpu_types.hpp"
|
||||
#if defined(__APPLE__)
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#include "cpu_types.hpp"
|
||||
|
||||
namespace cpu_utils {
|
||||
enum class ISA { AMX, VEC };
|
||||
|
||||
inline ISA get_isa(const std::string& isa) {
|
||||
if (isa == "amx") {
|
||||
return ISA::AMX;
|
||||
} else if (isa == "vec") {
|
||||
return ISA::VEC;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Invalid isa type: " + isa);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct VecTypeTrait {
|
||||
using vec_t = void;
|
||||
@@ -53,66 +48,26 @@ struct Counter {
|
||||
int64_t acquire_counter() { return counter++; }
|
||||
};
|
||||
|
||||
inline int64_t get_available_l2_size() {
|
||||
inline int64_t get_l2_size() {
|
||||
static int64_t size = []() {
|
||||
const uint32_t l2_cache_size = at::cpu::L2_cache_size();
|
||||
#if defined(__APPLE__)
|
||||
// macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
|
||||
int64_t l2_cache_size = 0;
|
||||
size_t len = sizeof(l2_cache_size);
|
||||
if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
|
||||
l2_cache_size > 0) {
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
}
|
||||
// Fallback if sysctlbyname fails
|
||||
return 128LL * 1024 >> 1; // use 50% of 128KB
|
||||
#else
|
||||
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||
assert(l2_cache_size != -1);
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
#endif
|
||||
}();
|
||||
return size;
|
||||
}
|
||||
|
||||
template <int32_t alignment_v, typename T>
|
||||
inline T round_up(T size) {
|
||||
T alignment = alignment_v;
|
||||
return (((size + alignment - 1) / alignment) * alignment);
|
||||
}
|
||||
|
||||
template <int32_t alignment_v, typename T>
|
||||
inline T round_down(T size) {
|
||||
T alignment = alignment_v;
|
||||
return (size / alignment) * alignment;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
|
||||
int32_t stride) {
|
||||
std::stringstream ss;
|
||||
ss << std::fixed << std::setprecision(5) << name << ": [\n";
|
||||
auto* curr_logits_buffer = ptr;
|
||||
for (int32_t m = 0; m < row; ++m) {
|
||||
for (int32_t n = 0; n < col; ++n) {
|
||||
ss << curr_logits_buffer[n] << ", ";
|
||||
}
|
||||
ss << "\n";
|
||||
curr_logits_buffer += stride;
|
||||
}
|
||||
ss << "]\n";
|
||||
std::printf("%s", ss.str().c_str());
|
||||
}
|
||||
|
||||
class ScratchPadManager {
|
||||
public:
|
||||
static constexpr size_t allocation_unit = 4 * 1024; // 4KB
|
||||
|
||||
static ScratchPadManager* get_scratchpad_manager();
|
||||
|
||||
ScratchPadManager();
|
||||
|
||||
template <typename T>
|
||||
T* get_data() {
|
||||
return reinterpret_cast<T*>(ptr_);
|
||||
}
|
||||
|
||||
static size_t round(size_t size) {
|
||||
return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
|
||||
}
|
||||
|
||||
void realloc(size_t new_size);
|
||||
|
||||
private:
|
||||
size_t size_;
|
||||
void* ptr_;
|
||||
};
|
||||
} // namespace cpu_utils
|
||||
|
||||
#endif
|
||||
|
||||
@@ -107,16 +107,6 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
||||
prop.location.id = device;
|
||||
prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
|
||||
|
||||
#ifndef USE_ROCM
|
||||
int flag = 0;
|
||||
CUDA_CHECK(cuDeviceGetAttribute(
|
||||
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
||||
device));
|
||||
if (flag) { // support GPUDirect RDMA if possible
|
||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// Allocate memory using cuMemCreate
|
||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||
|
||||
@@ -118,24 +118,6 @@
|
||||
} \
|
||||
}
|
||||
|
||||
#define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \
|
||||
if (expr) { \
|
||||
constexpr bool const_expr = true; \
|
||||
__VA_ARGS__(); \
|
||||
} else { \
|
||||
constexpr bool const_expr = false; \
|
||||
__VA_ARGS__(); \
|
||||
}
|
||||
|
||||
#define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \
|
||||
if (group_size == 128) { \
|
||||
constexpr int const_group_size = 128; \
|
||||
__VA_ARGS__(); \
|
||||
} else if (group_size == 64) { \
|
||||
constexpr int const_group_size = 64; \
|
||||
__VA_ARGS__(); \
|
||||
}
|
||||
|
||||
#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \
|
||||
switch (NUM_DIMS) { \
|
||||
case 2: { \
|
||||
|
||||
@@ -444,31 +444,23 @@ __device__ inline T apply_sigmoid(T val) {
|
||||
return cuda_cast<T, float>(sigmoid_accurate(f));
|
||||
}
|
||||
|
||||
template <ScoringFunc SF, typename T>
|
||||
__device__ inline T apply_scoring(T val) {
|
||||
if constexpr (SF == SCORING_NONE) {
|
||||
return val;
|
||||
} else if constexpr (SF == SCORING_SIGMOID) {
|
||||
return apply_sigmoid(val);
|
||||
} else {
|
||||
static_assert(SF == SCORING_NONE || SF == SCORING_SIGMOID,
|
||||
"Unsupported ScoringFunc in apply_scoring");
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, ScoringFunc SF>
|
||||
template <typename T>
|
||||
__device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||
cg::thread_block_tile<32> const& tile,
|
||||
int32_t const lane_id,
|
||||
int const num_experts_per_group) {
|
||||
int const num_experts_per_group,
|
||||
int const scoring_func) {
|
||||
// Get the top2 per thread
|
||||
T largest = neg_inf<T>();
|
||||
T second_largest = neg_inf<T>();
|
||||
|
||||
if (num_experts_per_group > WARP_SIZE) {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
T value = apply_scoring<SF>(input[i]);
|
||||
T value = input[i];
|
||||
// Apply scoring function if needed
|
||||
if (scoring_func == SCORING_SIGMOID) {
|
||||
value = apply_sigmoid(value);
|
||||
}
|
||||
value = value + bias[i];
|
||||
|
||||
if (value > largest) {
|
||||
@@ -480,11 +472,17 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||
}
|
||||
} else {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
T value = apply_scoring<SF>(input[i]);
|
||||
T value = input[i];
|
||||
// Apply scoring function if needed
|
||||
if (scoring_func == SCORING_SIGMOID) {
|
||||
value = apply_sigmoid(value);
|
||||
}
|
||||
value = value + bias[i];
|
||||
largest = value;
|
||||
}
|
||||
}
|
||||
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
// Get the top2 warpwise
|
||||
T max1 = cg::reduce(tile, largest, cg::greater<T>());
|
||||
|
||||
@@ -503,12 +501,13 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, ScoringFunc SF>
|
||||
template <typename T>
|
||||
__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
|
||||
int64_t const num_tokens,
|
||||
int64_t const num_cases,
|
||||
int64_t const n_group,
|
||||
int64_t const num_experts_per_group) {
|
||||
int64_t const num_experts_per_group,
|
||||
int const scoring_func) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
@@ -526,21 +525,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;");
|
||||
#endif
|
||||
topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
|
||||
num_experts_per_group);
|
||||
topk_with_k2(output, input, group_bias, tile, lane_id,
|
||||
num_experts_per_group, scoring_func);
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
|
||||
template <typename T, typename IdxT>
|
||||
__global__ void group_idx_and_topk_idx_kernel(
|
||||
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
|
||||
T const* bias, int64_t const num_tokens, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool renormalize,
|
||||
double routed_scaling_factor) {
|
||||
double routed_scaling_factor, int scoring_func) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
int32_t case_id =
|
||||
@@ -550,11 +549,6 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
topk_values += case_id * topk;
|
||||
topk_indices += case_id * topk;
|
||||
|
||||
constexpr bool kUseStaticNGroup = (NGroup > 0);
|
||||
// use int32 to avoid implicit conversion
|
||||
int32_t const n_group_i32 =
|
||||
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
|
||||
|
||||
int32_t align_num_experts_per_group =
|
||||
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
|
||||
|
||||
@@ -580,17 +574,17 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min =
|
||||
WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
|
||||
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
||||
// The check is necessary to avoid abnormal input
|
||||
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
|
||||
if (lane_id < n_group && is_finite(group_scores[lane_id])) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
|
||||
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
|
||||
int count_equal_to_top_value = WARP_SIZE - n_group;
|
||||
int pre_count_equal_to_top_value = 0;
|
||||
// Use loop to find the largset top_group
|
||||
while (count_equal_to_top_value < target_num_min) {
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = neg_inf<T>();
|
||||
@@ -610,7 +604,7 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
auto process_group = [&](int i_group) {
|
||||
for (int i_group = 0; i_group < n_group; i_group++) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
((group_scores[i_group] == topk_group_value) &&
|
||||
(count_equalto_topkth_group < num_equalto_topkth_group))) {
|
||||
@@ -619,10 +613,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
i += WARP_SIZE) {
|
||||
T candidates = neg_inf<T>();
|
||||
if (i < num_experts_per_group) {
|
||||
// apply scoring function (if any) and add bias
|
||||
// Apply scoring function (if any) and add bias
|
||||
T input = scores[offset + i];
|
||||
if (is_finite(input)) {
|
||||
T score = apply_scoring<SF>(input);
|
||||
T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
|
||||
: input;
|
||||
candidates = score + bias[offset + i];
|
||||
}
|
||||
}
|
||||
@@ -632,21 +627,12 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
count_equalto_topkth_group++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if constexpr (kUseStaticNGroup) {
|
||||
#pragma unroll
|
||||
for (int i_group = 0; i_group < NGroup; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
} else {
|
||||
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
}
|
||||
queue.done();
|
||||
__syncwarp();
|
||||
// Get the topk_idx
|
||||
queue.dumpIdx(s_topk_idx);
|
||||
__syncwarp();
|
||||
}
|
||||
|
||||
// Load the valid score value
|
||||
@@ -660,13 +646,12 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (i < topk) {
|
||||
// Load the score value (without bias) for normalization
|
||||
T input = scores[s_topk_idx[i]];
|
||||
value = apply_scoring<SF>(input);
|
||||
value =
|
||||
(scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
if (renormalize) {
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -674,13 +659,14 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
if (if_proceed_next_topk) {
|
||||
float scale = routed_scaling_factor;
|
||||
if (renormalize) {
|
||||
scale /= topk_sum;
|
||||
}
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
float base = cuda_cast<float, T>(s_topk_value[i]);
|
||||
float value = base * scale;
|
||||
float value;
|
||||
if (renormalize) {
|
||||
value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
|
||||
routed_scaling_factor;
|
||||
} else {
|
||||
value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
|
||||
}
|
||||
topk_indices[i] = s_topk_idx[i];
|
||||
topk_values[i] = value;
|
||||
}
|
||||
@@ -698,45 +684,6 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT, ScoringFunc SF>
|
||||
inline void launch_group_idx_and_topk_kernel(
|
||||
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
|
||||
float* topk_values, IdxT* topk_indices, T const* bias,
|
||||
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
|
||||
int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool const renormalize,
|
||||
double const routed_scaling_factor) {
|
||||
auto launch = [&](auto* kernel_instance2) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, bias, num_tokens, n_group,
|
||||
topk_group, topk, num_experts, num_experts_per_group,
|
||||
renormalize, routed_scaling_factor);
|
||||
};
|
||||
|
||||
switch (n_group) {
|
||||
case 4: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT>
|
||||
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
IdxT* topk_indices, T const* bias, int64_t const num_tokens,
|
||||
@@ -747,6 +694,7 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
cudaStream_t const stream = 0) {
|
||||
int64_t num_cases = num_tokens * n_group;
|
||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
|
||||
cudaLaunchConfig_t config;
|
||||
config.gridDim = topk_with_k2_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
@@ -757,33 +705,16 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
auto const sf = static_cast<ScoringFunc>(scoring_func);
|
||||
int64_t const num_experts_per_group = num_experts / n_group;
|
||||
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
|
||||
num_tokens, num_cases, n_group, num_experts_per_group);
|
||||
};
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// should be guarded by higher level checks.
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
|
||||
num_tokens, num_cases, n_group, num_experts / n_group,
|
||||
scoring_func);
|
||||
|
||||
int64_t topk_with_k_group_num_blocks =
|
||||
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
size_t dynamic_smem_in_bytes =
|
||||
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
|
||||
topk);
|
||||
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
|
||||
config.gridDim = topk_with_k_group_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = dynamic_smem_in_bytes;
|
||||
@@ -792,24 +723,10 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, bias, num_tokens, n_group,
|
||||
topk_group, topk, num_experts, num_experts / n_group,
|
||||
renormalize, routed_scaling_factor, scoring_func);
|
||||
}
|
||||
|
||||
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
|
||||
|
||||
1
csrc/moe/marlin_moe_wna16/.gitignore
vendored
1
csrc/moe/marlin_moe_wna16/.gitignore
vendored
@@ -1,3 +1,2 @@
|
||||
sm*_kernel_*.cu
|
||||
kernel_selector.h
|
||||
kernel_*.cu
|
||||
|
||||
@@ -10,8 +10,6 @@ import jinja2
|
||||
|
||||
ARCHS = []
|
||||
SUPPORT_FP8 = False
|
||||
SUPPORT_SM75 = False
|
||||
SUPPORT_SM80 = False
|
||||
for arch in sys.argv[1].split(","):
|
||||
arch = arch[: arch.index(".") + 2].replace(".", "")
|
||||
arch = int(arch)
|
||||
@@ -21,10 +19,6 @@ for arch in sys.argv[1].split(","):
|
||||
# with FP16 MMA, so it cannot achieve any acceleration.
|
||||
if arch in [89, 120]:
|
||||
SUPPORT_FP8 = True
|
||||
if arch >= 80:
|
||||
SUPPORT_SM80 = True
|
||||
if arch == 75:
|
||||
SUPPORT_SM75 = True
|
||||
|
||||
FILE_HEAD_COMMENT = """
|
||||
// auto generated by generate_kernels.py
|
||||
@@ -163,7 +157,6 @@ def remove_old_kernels():
|
||||
|
||||
def generate_new_kernels():
|
||||
result_dict = {}
|
||||
sm_75_result_dict = {}
|
||||
|
||||
for quant_config in QUANT_CONFIGS:
|
||||
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
|
||||
@@ -181,8 +174,6 @@ def generate_new_kernels():
|
||||
s_type = quant_config.get("s_type", c_type)
|
||||
if (a_type, b_type, c_type) not in result_dict:
|
||||
result_dict[(a_type, b_type, c_type)] = []
|
||||
if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
|
||||
sm_75_result_dict[(a_type, b_type, c_type)] = []
|
||||
|
||||
for group_blocks, m_blocks, thread_configs in itertools.product(
|
||||
all_group_blocks, all_m_blocks, all_thread_configs
|
||||
@@ -206,89 +197,78 @@ def generate_new_kernels():
|
||||
"thread_k_blocks": thread_k // 16,
|
||||
"thread_n_blocks": thread_n // 16,
|
||||
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
|
||||
"stages": 4,
|
||||
"stages": "pipe_stages",
|
||||
"group_blocks": group_blocks,
|
||||
"is_zp_float": "false",
|
||||
}
|
||||
|
||||
if SUPPORT_SM80:
|
||||
result_dict[(a_type, b_type, c_type)].append(config)
|
||||
if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
|
||||
config_sm75 = config.copy()
|
||||
config_sm75["stages"] = 2
|
||||
sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
|
||||
result_dict[(a_type, b_type, c_type)].append(config)
|
||||
|
||||
kernel_selector_str = FILE_HEAD_COMMENT
|
||||
|
||||
for result_dict_tmp in [result_dict, sm_75_result_dict]:
|
||||
for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
|
||||
all_template_str_list = []
|
||||
if not config_list:
|
||||
continue
|
||||
for config in config_list:
|
||||
s_type = config["s_type"]
|
||||
template_str = jinja2.Template(TEMPLATE).render(
|
||||
for (a_type, b_type, c_type), config_list in result_dict.items():
|
||||
all_template_str_list = []
|
||||
for config in config_list:
|
||||
s_type = config["s_type"]
|
||||
template_str = jinja2.Template(TEMPLATE).render(
|
||||
a_type_id=f"vllm::{a_type}.id()",
|
||||
b_type_id=f"vllm::{b_type}.id()",
|
||||
c_type_id=f"vllm::{c_type}.id()",
|
||||
s_type_id=f"vllm::{s_type}.id()",
|
||||
**config,
|
||||
)
|
||||
all_template_str_list.append(template_str)
|
||||
|
||||
conditions = [
|
||||
f"a_type == vllm::{a_type}",
|
||||
f"b_type == vllm::{b_type}",
|
||||
f"c_type == vllm::{c_type}",
|
||||
f"s_type == vllm::{s_type}",
|
||||
f"threads == {config['threads']}",
|
||||
f"thread_m_blocks == {config['thread_m_blocks']}",
|
||||
f"thread_n_blocks == {config['thread_n_blocks']}",
|
||||
f"thread_k_blocks == {config['thread_k_blocks']}",
|
||||
f"m_block_size_8 == {config['m_block_size_8']}",
|
||||
f"group_blocks == {config['group_blocks']}",
|
||||
f"is_zp_float == {config['is_zp_float']}",
|
||||
]
|
||||
conditions = " && ".join(conditions)
|
||||
|
||||
if kernel_selector_str == FILE_HEAD_COMMENT:
|
||||
kernel_selector_str += f"if ({conditions})\n kernel = "
|
||||
else:
|
||||
kernel_selector_str += f"else if ({conditions})\n kernel = "
|
||||
|
||||
kernel_template2 = (
|
||||
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
|
||||
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
|
||||
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
|
||||
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
|
||||
"{{is_zp_float}}>;"
|
||||
)
|
||||
|
||||
kernel_selector_str += (
|
||||
jinja2.Template(kernel_template2).render(
|
||||
a_type_id=f"vllm::{a_type}.id()",
|
||||
b_type_id=f"vllm::{b_type}.id()",
|
||||
c_type_id=f"vllm::{c_type}.id()",
|
||||
s_type_id=f"vllm::{s_type}.id()",
|
||||
**config,
|
||||
)
|
||||
all_template_str_list.append(template_str)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
conditions = [
|
||||
f"a_type == vllm::{a_type}",
|
||||
f"b_type == vllm::{b_type}",
|
||||
f"c_type == vllm::{c_type}",
|
||||
f"s_type == vllm::{s_type}",
|
||||
f"threads == {config['threads']}",
|
||||
f"thread_m_blocks == {config['thread_m_blocks']}",
|
||||
f"thread_n_blocks == {config['thread_n_blocks']}",
|
||||
f"thread_k_blocks == {config['thread_k_blocks']}",
|
||||
f"m_block_size_8 == {config['m_block_size_8']}",
|
||||
f"stages == {config['stages']}",
|
||||
f"group_blocks == {config['group_blocks']}",
|
||||
f"is_zp_float == {config['is_zp_float']}",
|
||||
]
|
||||
conditions = " && ".join(conditions)
|
||||
file_content = FILE_HEAD + "\n\n"
|
||||
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
|
||||
if a_type == "kFE4M3fn":
|
||||
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||
else:
|
||||
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||
|
||||
if kernel_selector_str == FILE_HEAD_COMMENT:
|
||||
kernel_selector_str += f"if ({conditions})\n kernel = "
|
||||
else:
|
||||
kernel_selector_str += f"else if ({conditions})\n kernel = "
|
||||
filename = filename.lower()
|
||||
|
||||
kernel_template2 = (
|
||||
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
|
||||
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
|
||||
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
|
||||
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
|
||||
"{{is_zp_float}}>;"
|
||||
)
|
||||
|
||||
kernel_selector_str += (
|
||||
jinja2.Template(kernel_template2).render(
|
||||
a_type_id=f"vllm::{a_type}.id()",
|
||||
b_type_id=f"vllm::{b_type}.id()",
|
||||
c_type_id=f"vllm::{c_type}.id()",
|
||||
s_type_id=f"vllm::{s_type}.id()",
|
||||
**config,
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
file_content = FILE_HEAD + "\n\n"
|
||||
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
|
||||
if a_type == "kFE4M3fn":
|
||||
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||
elif result_dict_tmp is sm_75_result_dict:
|
||||
filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||
else:
|
||||
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||
|
||||
filename = filename.lower()
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
|
||||
f.write(file_content)
|
||||
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
|
||||
f.write(file_content)
|
||||
|
||||
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
|
||||
kernel_selector_str += (
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#include "quantization/gptq_marlin/marlin.cuh"
|
||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/gptq_marlin/dequant.h"
|
||||
#include "quantization/gptq_marlin/marlin_mma.h"
|
||||
#include "core/scalar_type.hpp"
|
||||
|
||||
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
||||
@@ -36,7 +35,7 @@
|
||||
|
||||
namespace MARLIN_NAMESPACE_NAME {
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
|
||||
template <typename scalar_t, // compute dtype, half or nv_float16
|
||||
const vllm::ScalarTypeId b_type_id, // weight MarlinScalarType id
|
||||
@@ -85,6 +84,146 @@ __global__ void Marlin(
|
||||
|
||||
#else
|
||||
|
||||
// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
|
||||
// output/accumulation.
|
||||
template <vllm::ScalarTypeId type_id, int k_size = 16>
|
||||
__device__ inline void mma(
|
||||
const typename MarlinScalarType<type_id>::FragA& a_frag,
|
||||
const typename MarlinScalarType<type_id>::FragB& frag_b,
|
||||
typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
|
||||
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
|
||||
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
|
||||
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
|
||||
if constexpr (k_size == 16) {
|
||||
if constexpr (std::is_same<scalar_t, half>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
|
||||
"f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
|
||||
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
|
||||
"r"(c[1]), "r"(c[2]), "r"(c[3]));
|
||||
}
|
||||
} else if (k_size == 32) {
|
||||
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <vllm::ScalarTypeId type_id, int k_size = 16>
|
||||
__device__ inline void mma_trans(
|
||||
const typename MarlinScalarType<type_id>::FragA& a_frag,
|
||||
const typename MarlinScalarType<type_id>::FragB& frag_b,
|
||||
const typename MarlinScalarType<type_id>::FragB& frag_b2,
|
||||
typename MarlinScalarType<type_id>::FragC& frag_c) {
|
||||
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
|
||||
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
|
||||
const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
|
||||
if constexpr (k_size == 16) {
|
||||
if constexpr (std::is_same<scalar_t, half>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
|
||||
"f"(c[3]));
|
||||
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
|
||||
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
|
||||
"r"(c[3]));
|
||||
}
|
||||
} else {
|
||||
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||
float* c = reinterpret_cast<float*>(&frag_c);
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1200
|
||||
asm volatile(
|
||||
"mma.sync.aligned.kind::f8f6f4.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
#else
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||
#endif
|
||||
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
|
||||
// memory, directly in tensor core layout.
|
||||
template <int count, vllm::ScalarTypeId type_id>
|
||||
@@ -300,20 +439,9 @@ __global__ void Marlin(
|
||||
if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
|
||||
// Turing TensorCore only supports fp16 and int8
|
||||
if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
|
||||
return;
|
||||
#endif
|
||||
|
||||
int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
|
||||
constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
|
||||
constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
|
||||
#else
|
||||
constexpr bool use_fp16_accum = false;
|
||||
#endif
|
||||
using Adtype = MarlinScalarType<a_type_id>;
|
||||
using Cdtype = MarlinScalarType<c_type_id>;
|
||||
|
||||
@@ -490,22 +618,7 @@ __global__ void Marlin(
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
|
||||
|
||||
if constexpr (moe_block_size >= 16)
|
||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 16);
|
||||
if constexpr (moe_block_size >= 8)
|
||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 8);
|
||||
if constexpr (moe_block_size >= 4)
|
||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 4);
|
||||
if constexpr (moe_block_size >= 2)
|
||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 2);
|
||||
|
||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 1);
|
||||
block_num_valid_tokens = local_count;
|
||||
#else
|
||||
block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count);
|
||||
#endif
|
||||
|
||||
if (lane_id == 0)
|
||||
reinterpret_cast<int*>(sh_new)[0] = block_num_valid_tokens;
|
||||
@@ -905,6 +1018,10 @@ __global__ void Marlin(
|
||||
constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
|
||||
: (stages * s_sh_stage);
|
||||
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
||||
// shared memory reused by reduction should be smaller than
|
||||
// shared memory used by weight.
|
||||
static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
|
||||
stages * b_sh_stage);
|
||||
int4* sh_a = sh_s + sh_s_size;
|
||||
|
||||
// Register storage for double buffer of shared memory reads.
|
||||
@@ -1428,13 +1545,11 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < thread_m_blocks; i++) {
|
||||
if constexpr (m_block_size_8) {
|
||||
mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
|
||||
frag_c[i][j][0]);
|
||||
mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
|
||||
frag_c[i][j][0]);
|
||||
} else {
|
||||
mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
|
||||
frag_c[i][j][0]);
|
||||
mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
|
||||
frag_c[i][j][1]);
|
||||
mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
|
||||
mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1468,12 +1583,10 @@ __global__ void Marlin(
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < thread_m_blocks; i++) {
|
||||
mma<a_type_id, false, 32>(
|
||||
frag_a[k2][i], frag_b[0],
|
||||
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
|
||||
mma<a_type_id, false, 32>(
|
||||
frag_a[k2][i], frag_b[1],
|
||||
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
|
||||
mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
|
||||
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
|
||||
mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
|
||||
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
|
||||
}
|
||||
|
||||
if constexpr (group_blocks != -1) {
|
||||
@@ -2019,21 +2132,6 @@ __global__ void Marlin(
|
||||
// While this pattern may not be the most readable, other ways of writing
|
||||
// the loop seemed to noticeably worse performance after compilation.
|
||||
if (slice_iters == 0) {
|
||||
// convert fp16 accum to fp32 for reduction
|
||||
if constexpr (use_fp16_accum) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
|
||||
float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
|
||||
scalar_t* frag_c_part_half =
|
||||
reinterpret_cast<scalar_t*>(frag_c_part_float);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 3; i >= 0; i--) {
|
||||
frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (is_a_8bit) {
|
||||
float frag_a_s[2 * thread_m_blocks];
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ typedef struct {
|
||||
|
||||
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
||||
int prob_n, int prob_k, int num_bits, int group_size,
|
||||
bool has_act_order, bool is_k_full, int stages) {
|
||||
bool has_act_order, bool is_k_full) {
|
||||
bool cache_scales_chunk = has_act_order && !is_k_full;
|
||||
|
||||
int tb_n = th_config.thread_n;
|
||||
@@ -160,13 +160,13 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
||||
|
||||
if (cache_scales_chunk) {
|
||||
int load_groups =
|
||||
tb_groups * stages * 2; // Chunk size is 2x pipeline over dim K
|
||||
tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
|
||||
load_groups = max(load_groups, 32); // We load at least 32 scale groups
|
||||
return load_groups * tb_n * 2;
|
||||
} else {
|
||||
int tb_scales = tb_groups * tb_n * 2;
|
||||
|
||||
return tb_scales * stages;
|
||||
return tb_scales * pipe_stages;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
|
||||
int thread_m_blocks, int prob_m, int prob_n,
|
||||
int prob_k, int num_bits, int group_size,
|
||||
bool has_act_order, bool is_k_full, int has_zp,
|
||||
int is_zp_float, bool is_a_8bit, int stages) {
|
||||
int is_zp_float, bool is_a_8bit) {
|
||||
int pack_factor = 32 / num_bits;
|
||||
|
||||
// Get B size
|
||||
@@ -185,8 +185,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
|
||||
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
|
||||
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
|
||||
int sh_block_meta_size = tb_m * 16;
|
||||
int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
|
||||
int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
|
||||
int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
|
||||
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
|
||||
int sh_red_size = tb_m * (tb_n + 8) * 2;
|
||||
int sh_bias_size = tb_n * 2;
|
||||
int tmp_size =
|
||||
@@ -195,8 +195,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
|
||||
|
||||
int sh_s_size =
|
||||
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
|
||||
group_size, has_act_order, is_k_full, stages);
|
||||
int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
|
||||
group_size, has_act_order, is_k_full);
|
||||
int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
|
||||
int sh_zp_size = 0;
|
||||
if (has_zp) {
|
||||
if (is_zp_float)
|
||||
@@ -217,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
|
||||
int thread_m_blocks, int prob_m, int prob_n, int prob_k,
|
||||
int num_bits, int group_size, bool has_act_order,
|
||||
bool is_k_full, int has_zp, int is_zp_float,
|
||||
bool is_a_8bit, int stages, int max_shared_mem) {
|
||||
int max_shared_mem, bool is_a_8bit) {
|
||||
// Sanity
|
||||
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
|
||||
th_config.num_threads == -1) {
|
||||
@@ -243,7 +243,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
|
||||
int cache_size =
|
||||
get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
|
||||
prob_n, prob_k, num_bits, group_size, has_act_order,
|
||||
is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
|
||||
is_k_full, has_zp, is_zp_float, is_a_8bit);
|
||||
return cache_size <= max_shared_mem;
|
||||
}
|
||||
|
||||
@@ -252,7 +252,7 @@ MarlinFuncPtr get_marlin_kernel(
|
||||
const vllm::ScalarType c_type, const vllm::ScalarType s_type,
|
||||
int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
|
||||
bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
|
||||
int threads, bool is_zp_float, int stages) {
|
||||
int threads, bool is_zp_float) {
|
||||
int num_bits = b_type.size_bits();
|
||||
auto kernel = MarlinDefault;
|
||||
|
||||
@@ -266,8 +266,8 @@ exec_config_t determine_exec_config(
|
||||
const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
|
||||
int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
|
||||
bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
|
||||
bool is_k_full, bool has_zp, bool is_zp_float, bool is_a_8bit, int stages,
|
||||
int max_shared_mem, int sms) {
|
||||
bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
|
||||
bool is_a_8bit) {
|
||||
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
|
||||
thread_config_t* thread_configs = thread_m_blocks > 1
|
||||
? large_batch_thread_configs
|
||||
@@ -284,15 +284,15 @@ exec_config_t determine_exec_config(
|
||||
|
||||
if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
|
||||
prob_n, prob_k, num_bits, group_size, has_act_order,
|
||||
is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
|
||||
max_shared_mem - 512)) {
|
||||
is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
|
||||
is_a_8bit)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int cache_size = get_kernel_cache_size(
|
||||
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
|
||||
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
|
||||
is_a_8bit, stages);
|
||||
is_a_8bit);
|
||||
|
||||
int group_blocks = 0;
|
||||
if (!has_act_order) {
|
||||
@@ -303,7 +303,7 @@ exec_config_t determine_exec_config(
|
||||
get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
|
||||
th_config.thread_n / 16, th_config.thread_k / 16,
|
||||
m_block_size_8, has_act_order, has_zp, group_blocks,
|
||||
th_config.num_threads, is_zp_float, stages);
|
||||
th_config.num_threads, is_zp_float);
|
||||
|
||||
if (kernel == MarlinDefault) continue;
|
||||
|
||||
@@ -433,14 +433,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
||||
dev);
|
||||
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
|
||||
dev);
|
||||
TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
|
||||
"marlin kernel only support Turing or newer GPUs.");
|
||||
int stages = 4;
|
||||
if (major_capability == 7 && minor_capability == 5) {
|
||||
stages = 2;
|
||||
TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
|
||||
"Turing only support FP16 or INT8 activation.");
|
||||
}
|
||||
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
|
||||
"marlin kernel only support Ampere or newer GPUs.");
|
||||
if (a_type == vllm::kFE4M3fn) {
|
||||
TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
|
||||
"FP8 only support Ada Lovelace or newer GPUs.");
|
||||
@@ -467,8 +461,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
||||
exec_cfg = determine_exec_config(
|
||||
a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
|
||||
top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
|
||||
has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
|
||||
max_shared_mem, sms);
|
||||
has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
|
||||
is_a_8bit);
|
||||
thread_tfg = exec_cfg.tb_cfg;
|
||||
}
|
||||
|
||||
@@ -485,7 +479,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
||||
TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
|
||||
prob_m, prob_n, prob_k, num_bits, group_size,
|
||||
has_act_order, is_k_full, has_zp, is_zp_float,
|
||||
is_a_8bit, stages, max_shared_mem),
|
||||
max_shared_mem, is_a_8bit),
|
||||
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
|
||||
", thread_k = ", thread_tfg.thread_k,
|
||||
", thread_n = ", thread_tfg.thread_n,
|
||||
@@ -499,12 +493,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
||||
int sh_cache_size =
|
||||
get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
|
||||
prob_n, prob_k, num_bits, group_size, has_act_order,
|
||||
is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
|
||||
is_k_full, has_zp, is_zp_float, is_a_8bit);
|
||||
|
||||
auto kernel = get_marlin_kernel(
|
||||
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
|
||||
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
|
||||
num_threads, is_zp_float, stages);
|
||||
num_threads, is_zp_float);
|
||||
|
||||
if (kernel == MarlinDefault) {
|
||||
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
|
||||
@@ -866,4 +860,4 @@ torch::Tensor moe_wna16_marlin_gemm(
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace batched_moe_align_block_size {
|
||||
|
||||
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
|
||||
@@ -79,32 +80,17 @@ __global__ void batched_moe_align_block_size_kernel(
|
||||
} // namespace batched_moe_align_block_size
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ void _moe_align_block_size(
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts,
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
|
||||
int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
|
||||
int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
|
||||
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
|
||||
extern __shared__ int32_t shared_counts[];
|
||||
|
||||
// Compute input buffer offsets. Typically these will all be 0, except when
|
||||
// using Multi LoRA.
|
||||
int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
|
||||
int expert_ids_offset = max_num_m_blocks * model_offset;
|
||||
int cumsum_offset = (num_experts + 1) * model_offset;
|
||||
|
||||
// Use separate threadblocks to fill sorted_token_ids.
|
||||
// This is safe since the current kernel does not use sorted_token_ids.
|
||||
if (blockIdx.x % 2) {
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded;
|
||||
it += blockDim.x) {
|
||||
sorted_token_ids[sorted_token_ids_offset + it] = numel;
|
||||
}
|
||||
return;
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||
sorted_token_ids[it] = numel;
|
||||
}
|
||||
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
@@ -126,16 +112,9 @@ __device__ void _moe_align_block_size(
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid experts
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
int warp_idx = expert_id / experts_per_warp;
|
||||
int expert_offset = expert_id % experts_per_warp;
|
||||
int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
|
||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
|
||||
mask);
|
||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -156,194 +135,46 @@ __device__ void _moe_align_block_size(
|
||||
int cumsum_val;
|
||||
BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
|
||||
if (expert_id <= num_experts) {
|
||||
cumsum[cumsum_offset + expert_id] = cumsum_val;
|
||||
cumsum[expert_id] = cumsum_val;
|
||||
}
|
||||
|
||||
if (expert_id == num_experts) {
|
||||
total_tokens_post_pad[model_offset] = cumsum_val;
|
||||
*total_tokens_post_pad = cumsum_val;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[cumsum_offset + threadIdx.x];
|
||||
i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
|
||||
expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill remaining expert_ids with 0
|
||||
const size_t fill_start_idx =
|
||||
cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
|
||||
for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
|
||||
expert_ids[expert_ids_offset + i] = inactive_expert_id;
|
||||
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
|
||||
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
|
||||
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
|
||||
expert_ids[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, int32_t fill_threads>
|
||||
__device__ void _moe_align_block_size_small_batch_expert(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
|
||||
size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
|
||||
int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
|
||||
int32_t* token_mask, bool has_expert_map) {
|
||||
// Compute input buffer offsets. Typically these will all be 0, except when
|
||||
// using Multi LoRA.
|
||||
int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
|
||||
int expert_ids_offset = max_num_m_blocks * model_offset;
|
||||
|
||||
// Use an additional group of threads to fill sorted_token_ids.
|
||||
// Since the current kernel will use sorted_token_ids afterward,
|
||||
// we fill sorted_token_ids within the same threadblock to make
|
||||
// synchronization easier.
|
||||
if (threadIdx.x < fill_threads) {
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded;
|
||||
it += fill_threads) {
|
||||
sorted_token_ids[sorted_token_ids_offset + it] = numel;
|
||||
}
|
||||
// Three __syncthreads() corresponding to the other threads
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t tid = threadIdx.x - fill_threads;
|
||||
const size_t stride = blockDim.x - fill_threads;
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[(tid + 1) * num_experts + i] = 0;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid expert
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
|
||||
tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid < num_experts) {
|
||||
tokens_cnts[tid] = 0;
|
||||
for (int i = 1; i <= stride; ++i) {
|
||||
tokens_cnts[i * num_experts + tid] +=
|
||||
tokens_cnts[(i - 1) * num_experts + tid];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] =
|
||||
cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
|
||||
block_size;
|
||||
}
|
||||
total_tokens_post_pad[model_offset] =
|
||||
static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid < num_experts) {
|
||||
for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
|
||||
expert_ids[expert_ids_offset + i / block_size] = tid;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill remaining expert_ids with 0
|
||||
const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
|
||||
for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
|
||||
expert_ids[expert_ids_offset + i] = inactive_expert_id;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid expert
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
|
||||
|
||||
if (token_mask == nullptr || token_mask[i / topk_num]) {
|
||||
sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
|
||||
++tokens_cnts[tid * num_experts + expert_id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ void _count_and_sort_expert_tokens(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
|
||||
int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
|
||||
int32_t model_offset, int32_t topk_num, bool has_expert_map) {
|
||||
const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
|
||||
const size_t stride = blockDim.x * gridDim.y;
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid experts
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
|
||||
if (token_mask == nullptr || token_mask[i / topk_num]) {
|
||||
int32_t rank_post_pad = atomicAdd(
|
||||
&cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
|
||||
sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
|
||||
i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts,
|
||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
|
||||
int32_t topk_num, bool has_expert_map) {
|
||||
_moe_align_block_size(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, padded_num_experts, experts_per_warp, block_size, numel,
|
||||
cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
|
||||
0, 0, topk_num, nullptr, has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void count_and_sort_expert_tokens_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
|
||||
int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
|
||||
_count_and_sort_expert_tokens(
|
||||
topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
|
||||
max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
|
||||
size_t numel, int32_t num_experts) {
|
||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, int TOPK>
|
||||
@@ -362,111 +193,78 @@ __global__ void moe_sum_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, int32_t fill_threads>
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_small_batch_expert_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
|
||||
size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
|
||||
bool has_expert_map) {
|
||||
_moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, block_size, numel, max_num_tokens_padded,
|
||||
CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
|
||||
has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_lora_align_block_size_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
|
||||
int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
|
||||
int max_loras, size_t numel, int max_num_tokens_padded,
|
||||
int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
|
||||
int32_t* __restrict__ expert_ids, int32_t topk_num,
|
||||
int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
|
||||
int32_t* __restrict__ cumsum, int32_t experts_per_warp,
|
||||
int32_t padded_num_experts, int32_t* lora_ids,
|
||||
int32_t* __restrict__ token_mask, bool has_expert_map) {
|
||||
int lora_idx = blockIdx.x / 2;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
|
||||
return;
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||
sorted_token_ids[it] = numel;
|
||||
}
|
||||
|
||||
// Populate the token_mask based on the token-LoRA mapping
|
||||
int num_tokens = numel / topk_num;
|
||||
if (threadIdx.x == 0) {
|
||||
total_tokens_post_pad[lora_id] = 0;
|
||||
const size_t tid = threadIdx.x;
|
||||
const size_t stride = blockDim.x;
|
||||
|
||||
for (int i = 0; i < num_tokens; i++) {
|
||||
token_mask[(lora_id * num_tokens) + i] =
|
||||
(int)token_lora_mapping[i] == lora_id;
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[threadIdx.x] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[i * num_experts + threadIdx.x] +=
|
||||
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
_moe_align_block_size(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, padded_num_experts, experts_per_warp, block_size, numel,
|
||||
cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
|
||||
&token_mask[(lora_id * num_tokens)], has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void lora_count_and_sort_expert_tokens_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
|
||||
int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
|
||||
int32_t* lora_ids, bool has_expert_map) {
|
||||
int lora_idx = blockIdx.x;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int num_tokens = numel / topk_num;
|
||||
|
||||
_count_and_sort_expert_tokens(
|
||||
topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
|
||||
max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
|
||||
topk_num, has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t, int32_t fill_threads>
|
||||
__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
|
||||
int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
|
||||
int max_loras, size_t numel, int max_num_tokens_padded,
|
||||
int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
|
||||
int32_t* __restrict__ expert_ids, int topk_num,
|
||||
int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
|
||||
int32_t* token_mask, bool has_expert_map) {
|
||||
int lora_idx = blockIdx.x;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int num_tokens = numel / topk_num;
|
||||
if (threadIdx.x == 0) {
|
||||
total_tokens_post_pad[lora_id] = 0;
|
||||
|
||||
for (int i = 0; i < num_tokens; i++) {
|
||||
token_mask[(lora_id * num_tokens) + i] =
|
||||
(int)token_lora_mapping[i] == lora_id;
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] =
|
||||
cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
|
||||
block_size;
|
||||
}
|
||||
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
_moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
|
||||
-1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
|
||||
has_expert_map);
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill remaining expert_ids with 0
|
||||
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
|
||||
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
|
||||
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
|
||||
expert_ids[i] = 0;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[threadIdx.x * num_experts + expert_id];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace moe
|
||||
@@ -477,8 +275,7 @@ __global__ void moe_lora_align_block_size_small_batch_expert_kernel(
|
||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||
torch::Tensor experts_ids,
|
||||
torch::Tensor num_tokens_post_pad,
|
||||
std::optional<torch::Tensor> maybe_expert_map) {
|
||||
torch::Tensor num_tokens_post_pad) {
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int64_t padded_num_experts =
|
||||
@@ -490,19 +287,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
// BlockScan uses 1024 threads and assigns one thread per expert.
|
||||
TORCH_CHECK(padded_num_experts < 1024,
|
||||
"padded_num_experts must be less than 1024");
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
||||
bool has_expert_map = maybe_expert_map.has_value();
|
||||
torch::Tensor expert_map;
|
||||
if (has_expert_map) {
|
||||
expert_map = maybe_expert_map.value();
|
||||
} else {
|
||||
expert_map = torch::empty({0}, options_int);
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||
// calc needed amount of shared mem for `cumsum` tensors
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
||||
torch::Tensor cumsum_buffer =
|
||||
torch::empty({num_experts + 1}, options_int);
|
||||
bool small_batch_expert_mode =
|
||||
(topk_ids.numel() < 1024) && (num_experts <= 64);
|
||||
|
||||
@@ -512,58 +304,43 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
((threads + 1) * num_experts + (num_experts + 1)) *
|
||||
sizeof(int32_t);
|
||||
|
||||
// threadIdx.x >= fill_threads: counting experts and aligning
|
||||
// threadIdx.x < fill_threads: filling sorted_token_ids
|
||||
constexpr int32_t fill_threads = 256;
|
||||
auto small_batch_expert_kernel =
|
||||
vllm::moe::moe_align_block_size_small_batch_expert_kernel<
|
||||
scalar_t, fill_threads>;
|
||||
small_batch_expert_kernel<<<1, fill_threads + threads,
|
||||
shared_mem_size, stream>>>(
|
||||
scalar_t>;
|
||||
small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
expert_map.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
|
||||
has_expert_map);
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel(), sorted_token_ids.size(0));
|
||||
} else {
|
||||
torch::Tensor cumsum_buffer =
|
||||
torch::empty({num_experts + 1}, options_int);
|
||||
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
|
||||
|
||||
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
|
||||
size_t shared_mem_size =
|
||||
num_warps * experts_per_warp * sizeof(int32_t);
|
||||
|
||||
// launch two threadblocks
|
||||
// blockIdx.x == 0: counting experts and aligning
|
||||
// blockIdx.x == 1: filling sorted_token_ids
|
||||
align_kernel<<<2, threads, shared_mem_size, stream>>>(
|
||||
align_kernel<<<1, threads, shared_mem_size, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
|
||||
experts_per_warp, block_size, topk_ids.numel(),
|
||||
cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
|
||||
topk_ids.size(1), has_expert_map);
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
|
||||
padded_num_experts, experts_per_warp, block_size,
|
||||
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
|
||||
sorted_token_ids.size(0));
|
||||
|
||||
const int block_threads = std::min(256, (int)threads);
|
||||
const int num_blocks =
|
||||
(topk_ids.numel() + block_threads - 1) / block_threads;
|
||||
const int max_blocks = 65535;
|
||||
const int actual_blocks = std::min(num_blocks, max_blocks);
|
||||
dim3 gridDims(1, actual_blocks);
|
||||
|
||||
auto sort_kernel =
|
||||
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
|
||||
sort_kernel<<<gridDims, block_threads, 0, stream>>>(
|
||||
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
|
||||
topk_ids.numel(), num_experts, sorted_token_ids.size(0),
|
||||
topk_ids.size(1), has_expert_map);
|
||||
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -637,123 +414,3 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void moe_lora_align_block_size(
|
||||
torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
|
||||
int64_t num_experts, int64_t block_size, int64_t max_loras,
|
||||
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
|
||||
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
|
||||
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
|
||||
torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
|
||||
const int topk_num = topk_ids.size(1);
|
||||
|
||||
TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
|
||||
|
||||
int device_max_shared_mem;
|
||||
auto dev = topk_ids.get_device();
|
||||
cudaDeviceGetAttribute(&device_max_shared_mem,
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int64_t padded_num_experts =
|
||||
((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
|
||||
|
||||
// BlockScan uses 1024 threads and assigns one thread per expert.
|
||||
TORCH_CHECK(padded_num_experts < 1024,
|
||||
"padded_num_experts must be less than 1024");
|
||||
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
||||
torch::Tensor token_mask =
|
||||
torch::empty({max_loras * topk_ids.size(0)}, options_int);
|
||||
bool has_expert_map = maybe_expert_map.has_value();
|
||||
torch::Tensor expert_map;
|
||||
if (has_expert_map) {
|
||||
expert_map = maybe_expert_map.value();
|
||||
} else {
|
||||
expert_map = torch::empty({0}, options_int);
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_INTEGRAL_TYPES(
|
||||
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
|
||||
bool small_batch_expert_mode =
|
||||
(topk_ids.numel() < 1024) && (num_experts <= 64);
|
||||
|
||||
if (small_batch_expert_mode) {
|
||||
const int32_t num_thread = max((int32_t)num_experts, 128);
|
||||
const int32_t shared_mem =
|
||||
(num_thread + 1) * num_experts * sizeof(int32_t) +
|
||||
(num_experts + 1) * sizeof(int32_t);
|
||||
if (shared_mem > device_max_shared_mem) {
|
||||
TORCH_CHECK(false, "Shared memory usage exceeds device limit.");
|
||||
}
|
||||
|
||||
// threadIdx.x >= fill_threads: counting experts and aligning
|
||||
// threadIdx.x < fill_threads: filling sorted_token_ids
|
||||
constexpr int32_t fill_threads = 256;
|
||||
|
||||
dim3 blockDim(num_thread + fill_threads);
|
||||
auto kernel =
|
||||
vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
|
||||
scalar_t, fill_threads>;
|
||||
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
|
||||
(void*)kernel, shared_mem));
|
||||
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
token_lora_mapping.data_ptr<int32_t>(), block_size,
|
||||
expert_map.data_ptr<int32_t>(), num_experts, max_loras,
|
||||
topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
expert_ids.data_ptr<int32_t>(), topk_num,
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
|
||||
token_mask.data_ptr<int32_t>(), has_expert_map);
|
||||
} else {
|
||||
int num_thread = 1024;
|
||||
dim3 blockDim(num_thread);
|
||||
size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);
|
||||
|
||||
size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);
|
||||
|
||||
// cumsum buffer
|
||||
torch::Tensor cumsum =
|
||||
torch::zeros({max_loras * (num_experts + 1)}, options_int);
|
||||
|
||||
auto align_kernel =
|
||||
vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;
|
||||
|
||||
// launch two threadblocks for each lora
|
||||
// blockIdx.x % 2 == 0: counting experts and aligning
|
||||
// blockIdx.x % 2 == 1: filling sorted_token_ids
|
||||
align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
token_lora_mapping.data_ptr<int32_t>(), block_size,
|
||||
expert_map.data_ptr<int32_t>(), num_experts, max_loras,
|
||||
topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
expert_ids.data_ptr<int32_t>(), topk_num,
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
|
||||
WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
|
||||
token_mask.data_ptr<int32_t>(), has_expert_map);
|
||||
|
||||
const int block_threads = std::min(256, (int)num_thread);
|
||||
const int num_blocks =
|
||||
(topk_ids.numel() + block_threads - 1) / block_threads;
|
||||
|
||||
const int max_blocks = 65535;
|
||||
const int actual_blocks = std::min(num_blocks, max_blocks);
|
||||
|
||||
dim3 gridDims(max_loras, actual_blocks);
|
||||
auto sort_kernel =
|
||||
vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;
|
||||
|
||||
sort_kernel<<<gridDims, block_threads, 0, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
|
||||
expert_map.data_ptr<int32_t>(), topk_ids.numel(), num_experts,
|
||||
max_num_tokens_padded, topk_num, token_mask.data_ptr<int32_t>(),
|
||||
lora_ids.data_ptr<int32_t>(), has_expert_map);
|
||||
}
|
||||
});
|
||||
}
|
||||
174
csrc/moe/moe_lora_align_sum_kernels.cu
Normal file
174
csrc/moe/moe_lora_align_sum_kernels.cu
Normal file
@@ -0,0 +1,174 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/Atomic.cuh>
|
||||
|
||||
#include "../cuda_compat.h"
|
||||
#include "../dispatch_utils.h"
|
||||
#include "core/math.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
|
||||
int32_t col) {
|
||||
return row * total_col + col;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// TODO: Refactor common parts with moe_align_sum_kernels
|
||||
template <typename scalar_t, typename token_cnts_t>
|
||||
__global__ void moe_lora_align_sum_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
|
||||
int64_t block_size, int num_experts, int max_loras, size_t numel,
|
||||
int max_num_tokens_padded, int max_num_m_blocks,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
|
||||
int32_t* lora_ids) {
|
||||
const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
|
||||
int lora_idx = blockIdx.x;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
|
||||
return;
|
||||
}
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||
sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
|
||||
}
|
||||
|
||||
// Initialize expert_ids with -1
|
||||
for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
|
||||
expert_ids[lora_id * max_num_m_blocks + it] = -1;
|
||||
}
|
||||
|
||||
// Initialize total_tokens_post_pad with 0
|
||||
if (threadIdx.x == 0) {
|
||||
total_tokens_post_pad[lora_id] = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int mask = token_lora_mapping[i / topk_num] == lora_id;
|
||||
int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
|
||||
tokens_cnts[idx] += mask;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// We accumulate the token counts of all experts in thread 0.
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i - 1] +
|
||||
div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
|
||||
block_size) *
|
||||
block_size;
|
||||
}
|
||||
total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/**
|
||||
* For each expert, each thread processes the tokens of the corresponding
|
||||
* blocks and stores the corresponding expert_id for each block.
|
||||
*/
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
|
||||
threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
/** The cumsum[expert_id] stores the starting index of the tokens that the
|
||||
* expert with expert_id needs to process, and
|
||||
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
|
||||
* processed by the expert with expert_id within the current thread's token
|
||||
* shard.
|
||||
*/
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
|
||||
cumsum[expert_id];
|
||||
|
||||
int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
|
||||
atomicAdd(
|
||||
&sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
|
||||
(i - numel) * mask);
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
|
||||
}
|
||||
}
|
||||
|
||||
void moe_lora_align_block_size(
|
||||
torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
|
||||
int64_t num_experts, int64_t block_size, int64_t max_loras,
|
||||
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
|
||||
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
|
||||
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
|
||||
torch::Tensor lora_ids) {
|
||||
const int topk_num = topk_ids.size(1);
|
||||
|
||||
TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
|
||||
|
||||
int device_max_shared_mem;
|
||||
auto dev = topk_ids.get_device();
|
||||
cudaDeviceGetAttribute(&device_max_shared_mem,
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE,
|
||||
TORCH_CHECK(num_thread <= 1024,
|
||||
"num_thread must be less than 1024, "
|
||||
"and fallback is not implemented yet.");
|
||||
const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
|
||||
(num_experts + 1) * sizeof(int32_t);
|
||||
|
||||
if (shared_mem > device_max_shared_mem) {
|
||||
TORCH_CHECK(false,
|
||||
"Shared memory usage exceeds device limit, and global memory "
|
||||
"fallback is not implemented yet.");
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_INTEGRAL_TYPES(
|
||||
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
|
||||
dim3 blockDim(num_thread);
|
||||
auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
|
||||
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
|
||||
(void*)kernel, shared_mem));
|
||||
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
|
||||
max_loras, topk_ids.numel(), max_num_tokens_padded,
|
||||
max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
|
||||
expert_ids.data_ptr<int32_t>(), topk_num,
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
|
||||
});
|
||||
}
|
||||
@@ -11,8 +11,7 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
|
||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||
torch::Tensor experts_ids,
|
||||
torch::Tensor num_tokens_post_pad,
|
||||
std::optional<torch::Tensor> maybe_expert_map);
|
||||
torch::Tensor num_tokens_post_pad);
|
||||
|
||||
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
|
||||
int64_t block_size,
|
||||
@@ -27,7 +26,7 @@ void moe_lora_align_block_size(
|
||||
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
|
||||
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
|
||||
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
|
||||
torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
|
||||
torch::Tensor lora_ids);
|
||||
#ifndef USE_ROCM
|
||||
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
|
||||
torch::Tensor b_qweight, torch::Tensor b_scales,
|
||||
|
||||
@@ -19,8 +19,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
"moe_align_block_size(Tensor topk_ids, int num_experts,"
|
||||
" int block_size, Tensor! sorted_token_ids,"
|
||||
" Tensor! experts_ids,"
|
||||
" Tensor! num_tokens_post_pad,"
|
||||
" Tensor? maybe_expert_map) -> ()");
|
||||
" Tensor! num_tokens_post_pad) -> ()");
|
||||
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
|
||||
|
||||
// Aligning the number of tokens to be processed by each expert such
|
||||
@@ -47,8 +46,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
" Tensor !experts_ids,"
|
||||
" Tensor !num_tokens_post_pad,"
|
||||
" Tensor !adapter_enabled,"
|
||||
" Tensor !lora_ids,"
|
||||
" Tensor? maybe_expert_map) -> () ");
|
||||
" Tensor !lora_ids) -> () ");
|
||||
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user