Compare commits
274 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b6553be1bc | ||
|
|
64a9af5afa | ||
|
|
e4248849ec | ||
|
|
467bef18a3 | ||
|
|
5f1ac1e1d1 | ||
|
|
9368cc90b2 | ||
|
|
32b3946bb4 | ||
|
|
6b1391ca7e | ||
|
|
a3f66e75d1 | ||
|
|
319cb1e351 | ||
|
|
1efef71645 | ||
|
|
646d62f636 | ||
|
|
6cd4ae8acd | ||
|
|
c016047ed7 | ||
|
|
9af6d22e4c | ||
|
|
4589b94032 | ||
|
|
cc867be19c | ||
|
|
3a7cd627a8 | ||
|
|
8058c91108 | ||
|
|
7d44c469fe | ||
|
|
31f58be96a | ||
|
|
ebb2f383b8 | ||
|
|
c1c7dbbeeb | ||
|
|
5cf2daea9a | ||
|
|
b8089195b4 | ||
|
|
770e5dcdb8 | ||
|
|
c57c9415b1 | ||
|
|
01810f9236 | ||
|
|
59abbd84f9 | ||
|
|
95a6568b5c | ||
|
|
0eca5eacd0 | ||
|
|
12e5829221 | ||
|
|
3a4d417707 | ||
|
|
8335667c22 | ||
|
|
e1c4380d4c | ||
|
|
e31ae3de36 | ||
|
|
2ffb9b6e07 | ||
|
|
cda10fa3e2 | ||
|
|
c123bc33f9 | ||
|
|
b9a1791e2c | ||
|
|
989dcee981 | ||
|
|
3d64d366e0 | ||
|
|
eaa2e51088 | ||
|
|
d77f7fb871 | ||
|
|
2d8476e465 | ||
|
|
88be823d57 | ||
|
|
4e4f63ad45 | ||
|
|
d2f0e7e615 | ||
|
|
122cdca5f6 | ||
|
|
cf02f9b283 | ||
|
|
c4296b1a27 | ||
|
|
66c508b137 | ||
|
|
84166fee97 | ||
|
|
6e0cd10f72 | ||
|
|
e010688f50 | ||
|
|
441b65d8c7 | ||
|
|
46ecc57973 | ||
|
|
b6a3a9f76d | ||
|
|
ca27f0f9c1 | ||
|
|
aad30bd306 | ||
|
|
94ecee6282 | ||
|
|
8267f9916f | ||
|
|
7353492a47 | ||
|
|
7661e92ef8 | ||
|
|
f168b85725 | ||
|
|
da511d54d8 | ||
|
|
65c69444b1 | ||
|
|
94870359cd | ||
|
|
0d49483ea9 | ||
|
|
90b78ec5f9 | ||
|
|
91a2ef98ea | ||
|
|
3da2313d78 | ||
|
|
b61dc5f972 | ||
|
|
f8a1a2d108 | ||
|
|
3465b87ef8 | ||
|
|
c8134bea15 | ||
|
|
cb6d572e85 | ||
|
|
87360308b7 | ||
|
|
aa49f14832 | ||
|
|
9ef9173cfa | ||
|
|
85e2b7bb13 | ||
|
|
61059bee40 | ||
|
|
ec89524f50 | ||
|
|
f20f9f063b | ||
|
|
9bc8bb07cf | ||
|
|
1aeb925f34 | ||
|
|
188a4590d8 | ||
|
|
18093084be | ||
|
|
da40380214 | ||
|
|
8fc57501d3 | ||
|
|
af7fc84fd2 | ||
|
|
0678b52251 | ||
|
|
25b918eee6 | ||
|
|
a408820f2f | ||
|
|
c56ed8bb0e | ||
|
|
78dcf56cb3 | ||
|
|
b2fac67130 | ||
|
|
23027e2daf | ||
|
|
c3fd4d669a | ||
|
|
ef3f98b59f | ||
|
|
7ee2590478 | ||
|
|
53a5a0ce30 | ||
|
|
d459fae0a2 | ||
|
|
c8dcc15921 | ||
|
|
8f4ffbd373 | ||
|
|
5f2cd251d2 | ||
|
|
02658c2dfe | ||
|
|
01dc9a76db | ||
|
|
35cf32df30 | ||
|
|
8711bc5e68 | ||
|
|
2669a0d7b5 | ||
|
|
8e972d9c44 | ||
|
|
3336c8cfbe | ||
|
|
b124e1085b | ||
|
|
41aa578428 | ||
|
|
8d646c2e53 | ||
|
|
5d6d1adf15 | ||
|
|
1409ef9134 | ||
|
|
4555143ea7 | ||
|
|
52dceb172d | ||
|
|
abd7df2fca | ||
|
|
b712be98c7 | ||
|
|
a8da78eac9 | ||
|
|
5d96533e22 | ||
|
|
4de790fcad | ||
|
|
b5fd9506c1 | ||
|
|
135cf55cd1 | ||
|
|
6cac54f4d1 | ||
|
|
6865fe0074 | ||
|
|
e31446b6c8 | ||
|
|
bdf13965ab | ||
|
|
fa98d77773 | ||
|
|
01eee40536 | ||
|
|
19bdaf32b1 | ||
|
|
02f0c7b220 | ||
|
|
d054da1992 | ||
|
|
4b7817c119 | ||
|
|
d00dd65cd4 | ||
|
|
d81edded69 | ||
|
|
476844d44c | ||
|
|
4e68ae5e59 | ||
|
|
4e88723f32 | ||
|
|
118ff92111 | ||
|
|
ec2dcd80bc | ||
|
|
42243fbda0 | ||
|
|
6d18ed2a2e | ||
|
|
f32fcd9444 | ||
|
|
d32aa2e670 | ||
|
|
cc977286e7 | ||
|
|
17430e3653 | ||
|
|
1282bd812e | ||
|
|
bdce64f236 | ||
|
|
9e6f61e8c3 | ||
|
|
8655f47f37 | ||
|
|
4ce42f9204 | ||
|
|
8a57872b2a | ||
|
|
5bc1ad6cee | ||
|
|
9112b443a0 | ||
|
|
c57d577e8d | ||
|
|
ca2f6b9c30 | ||
|
|
20133cfee2 | ||
|
|
ebb1ec9318 | ||
|
|
5b168b6d7a | ||
|
|
9760fd8f6a | ||
|
|
b9f61e1387 | ||
|
|
d6fd3a33b8 | ||
|
|
432ec9926e | ||
|
|
2b102d51ad | ||
|
|
aa54a7bf7b | ||
|
|
2ad6194a02 | ||
|
|
c594cbf565 | ||
|
|
a35ca765a5 | ||
|
|
6aa8f9a4e7 | ||
|
|
1bc86a3da1 | ||
|
|
bbfa0c61d1 | ||
|
|
20079c6e36 | ||
|
|
9a1b9b99d7 | ||
|
|
8bf507d766 | ||
|
|
306d60401d | ||
|
|
f2c3f66d59 | ||
|
|
0f5e0d567e | ||
|
|
c55d804672 | ||
|
|
749f5bdd38 | ||
|
|
2a50ef5760 | ||
|
|
b8b904795d | ||
|
|
ba5111f237 | ||
|
|
1e123529d7 | ||
|
|
dff80b0e42 | ||
|
|
7782464a17 | ||
|
|
0f71e24034 | ||
|
|
1dab4d5718 | ||
|
|
7f21e8052b | ||
|
|
5a8641638a | ||
|
|
f49239cb45 | ||
|
|
2dbe8c0774 | ||
|
|
84ec470fca | ||
|
|
b29ca5c4d5 | ||
|
|
ec6833c5e9 | ||
|
|
e1fadf1197 | ||
|
|
43ff405b90 | ||
|
|
fba02e3bd1 | ||
|
|
4577fc9abb | ||
|
|
5f1d0c8118 | ||
|
|
c3bb9f2331 | ||
|
|
8f8900cee9 | ||
|
|
6acb7a6285 | ||
|
|
4f4a6b844a | ||
|
|
4d0a1541be | ||
|
|
77b6e74fe2 | ||
|
|
5acf828d99 | ||
|
|
3987e2ae96 | ||
|
|
77164dad5e | ||
|
|
3de3eadf5b | ||
|
|
3132290a14 | ||
|
|
1aa2f81b43 | ||
|
|
d54af615d5 | ||
|
|
a1cc9f33a3 | ||
|
|
a521ef06e5 | ||
|
|
64eaf5fe05 | ||
|
|
d1d61f3351 | ||
|
|
32ce3cf7c9 | ||
|
|
d58f9c7f7a | ||
|
|
c29034037d | ||
|
|
1b7cfd5a36 | ||
|
|
da4b69d0b4 | ||
|
|
c9479b2920 | ||
|
|
6f2909405e | ||
|
|
b169d5f7b6 | ||
|
|
f8977c233f | ||
|
|
f274581f44 | ||
|
|
0b1447f890 | ||
|
|
24d0ef8970 | ||
|
|
7fcfd954ff | ||
|
|
e740d07f07 | ||
|
|
a652e71dd0 | ||
|
|
34d6c447c4 | ||
|
|
972eddf7c9 | ||
|
|
fd7bb88d72 | ||
|
|
3c49dbdd03 | ||
|
|
1661a9c28f | ||
|
|
8e882ffdc0 | ||
|
|
26b4fa45be | ||
|
|
515b413ebf | ||
|
|
269d901734 | ||
|
|
7951d78738 | ||
|
|
6dbe5b5c93 | ||
|
|
643622ba46 | ||
|
|
a09c7ca9f2 | ||
|
|
0e98964e94 | ||
|
|
c68b5c63eb | ||
|
|
fced756923 | ||
|
|
321331b8ae | ||
|
|
6e4cea1cc5 | ||
|
|
435fa95444 | ||
|
|
4c2b38ce9e | ||
|
|
d781930f90 | ||
|
|
ce75efeecb | ||
|
|
aa42561e40 | ||
|
|
de65fc8e1e | ||
|
|
0c492b7824 | ||
|
|
0f0926b43f | ||
|
|
7f2c1a87e9 | ||
|
|
b78f844a67 | ||
|
|
5e13c07d00 | ||
|
|
774c5fde30 | ||
|
|
9a21e331ff | ||
|
|
3e9ce609bd | ||
|
|
794ae1f551 | ||
|
|
d73a9457a5 | ||
|
|
a3896c7f02 | ||
|
|
51e98e4ffd | ||
|
|
e56f44d9ec | ||
|
|
e0cbad4e30 | ||
|
|
b48d5cca16 |
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
LM eval harness on model to compare vs HF baseline computed offline.
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
Configs are found in configs/$MODEL.yaml
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|||||||
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
|||||||
|
|
||||||
### Visualizing the results
|
### Visualizing the results
|
||||||
|
|
||||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||||
If you do not see the table, please wait till the benchmark finish running.
|
If you do not see the table, please wait till the benchmark finish running.
|
||||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from lmdeploy.serve.openai.api_client import APIClient
|
from lmdeploy.serve.openai.api_client import APIClient
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
steps:
|
steps:
|
||||||
- label: "Build wheel - CUDA 12.8"
|
- label: "Build wheel - CUDA 12.8"
|
||||||
|
id: build-wheel-cuda-12-8
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
@@ -11,6 +12,7 @@ steps:
|
|||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 12.6"
|
- label: "Build wheel - CUDA 12.6"
|
||||||
|
id: build-wheel-cuda-12-6
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
@@ -28,6 +30,7 @@ steps:
|
|||||||
|
|
||||||
- label: "Build wheel - CUDA 11.8"
|
- label: "Build wheel - CUDA 11.8"
|
||||||
# depends_on: block-build-cu118-wheel
|
# depends_on: block-build-cu118-wheel
|
||||||
|
id: build-wheel-cuda-11-8
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
@@ -44,6 +47,7 @@ steps:
|
|||||||
|
|
||||||
- label: "Build release image"
|
- label: "Build release image"
|
||||||
depends_on: block-release-image-build
|
depends_on: block-release-image-build
|
||||||
|
id: build-release-image
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
@@ -51,6 +55,18 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
|
- label: "Annotate release workflow"
|
||||||
|
depends_on:
|
||||||
|
- build-release-image
|
||||||
|
- build-wheel-cuda-12-8
|
||||||
|
- build-wheel-cuda-12-6
|
||||||
|
- build-wheel-cuda-11-8
|
||||||
|
id: annotate-release-workflow
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "bash .buildkite/scripts/annotate-release.sh"
|
||||||
|
|
||||||
- label: "Build and publish TPU release image"
|
- label: "Build and publish TPU release image"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
if: build.env("NIGHTLY") == "1"
|
if: build.env("NIGHTLY") == "1"
|
||||||
@@ -70,9 +86,10 @@ steps:
|
|||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- input: "Provide Release version here"
|
- input: "Provide Release version here"
|
||||||
|
id: input-release-version
|
||||||
fields:
|
fields:
|
||||||
- text: "What is the release version?"
|
- text: "What is the release version?"
|
||||||
key: "release-version"
|
key: release-version
|
||||||
|
|
||||||
- block: "Build CPU release image"
|
- block: "Build CPU release image"
|
||||||
key: block-cpu-release-image-build
|
key: block-cpu-release-image-build
|
||||||
|
|||||||
31
.buildkite/scripts/annotate-release.sh
Executable file
31
.buildkite/scripts/annotate-release.sh
Executable file
@@ -0,0 +1,31 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Get release version and strip leading 'v' if present
|
||||||
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
|
||||||
|
|
||||||
|
if [ -z "$RELEASE_VERSION" ]; then
|
||||||
|
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||||
|
To download the wheel:
|
||||||
|
\`\`\`
|
||||||
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
To download and upload the image:
|
||||||
|
|
||||||
|
\`\`\`
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||||
|
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||||
|
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
|
docker push vllm/vllm-openai:latest
|
||||||
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
|
\`\`\`
|
||||||
|
EOF
|
||||||
17
.buildkite/scripts/ci-clean-log.sh
Normal file
17
.buildkite/scripts/ci-clean-log.sh
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Usage: ./ci_clean_log.sh ci.log
|
||||||
|
# This script strips timestamps and color codes from CI log files.
|
||||||
|
|
||||||
|
# Check if argument is given
|
||||||
|
if [ $# -lt 1 ]; then
|
||||||
|
echo "Usage: $0 ci.log"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
INPUT_FILE="$1"
|
||||||
|
|
||||||
|
# Strip timestamps
|
||||||
|
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
|
||||||
|
|
||||||
|
# Strip colorization
|
||||||
|
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
|
||||||
@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
|||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||||
|
fi
|
||||||
|
|
||||||
#ignore certain kernels tests
|
#ignore certain kernels tests
|
||||||
if [[ $commands == *" kernels/core"* ]]; then
|
if [[ $commands == *" kernels/core"* ]]; then
|
||||||
commands="${commands} \
|
commands="${commands} \
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ set -ex
|
|||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
if [[ -n "$container_id" ]]; then
|
if [[ -n "$container_id" ]]; then
|
||||||
|
podman stop --all -t0
|
||||||
podman rm -f "$container_id" || true
|
podman rm -f "$container_id" || true
|
||||||
fi
|
fi
|
||||||
podman system prune -f
|
podman system prune -f
|
||||||
@@ -37,7 +38,7 @@ function cpu_tests() {
|
|||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||||
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
|
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||||
}
|
}
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
|
|||||||
@@ -6,72 +6,70 @@ set -ex
|
|||||||
|
|
||||||
# allow to bind to different cores
|
# allow to bind to different cores
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
|
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
|
||||||
|
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
set -e;
|
set -e;
|
||||||
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
|
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||||
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
|
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
set -e
|
set -e
|
||||||
export NUMA_NODE=$2
|
export NUMA_NODE=$2
|
||||||
export BUILDKITE_BUILD_NUMBER=$3
|
|
||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -v -s tests/kernels/test_cache.py -m cpu_model
|
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||||
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
|
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
pytest -v -s tests/models/language/generation -m cpu_model
|
||||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
pytest -v -s tests/models/multimodal/generation \
|
||||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||||
|
-m cpu_model"
|
||||||
|
|
||||||
# Run compressed-tensor test
|
# Run compressed-tensor test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||||
|
|
||||||
# Run AWQ test
|
# Run AWQ test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
VLLM_USE_V1=0 pytest -s -v \
|
||||||
tests/quantization/test_ipex_quant.py"
|
tests/quantization/test_ipex_quant.py"
|
||||||
|
|
||||||
# Run chunked-prefill and prefix-cache test
|
# Run chunked-prefill and prefix-cache test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v -k cpu_model \
|
pytest -s -v -k cpu_model \
|
||||||
tests/basic_correctness/test_chunked_prefill.py"
|
tests/basic_correctness/test_chunked_prefill.py"
|
||||||
|
|
||||||
# online serving
|
# online serving
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
export VLLM_CPU_KVCACHE_SPACE=10
|
|
||||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
|
||||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
python3 benchmarks/benchmark_serving.py \
|
python3 benchmarks/benchmark_serving.py \
|
||||||
@@ -83,7 +81,7 @@ function cpu_tests() {
|
|||||||
--tokenizer facebook/opt-125m"
|
--tokenizer facebook/opt-125m"
|
||||||
|
|
||||||
# Run multi-lora tests
|
# Run multi-lora tests
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/lora/test_qwen2vl.py"
|
tests/lora/test_qwen2vl.py"
|
||||||
@@ -91,4 +89,4 @@ function cpu_tests() {
|
|||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
|
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||||
|
|||||||
@@ -2,102 +2,184 @@
|
|||||||
|
|
||||||
set -xu
|
set -xu
|
||||||
|
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
# Build the docker image.
|
# Build the docker image.
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
# Set up cleanup.
|
# Set up cleanup.
|
||||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
cleanup_docker() {
|
||||||
trap remove_docker_container EXIT
|
# Get Docker's root directory
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
remove_docker_container
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
# For HF_TOKEN.
|
# For HF_TOKEN.
|
||||||
source /etc/environment
|
source /etc/environment
|
||||||
# Run a simple end-to-end example.
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
vllm-tpu /bin/bash -c '
|
||||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
set -e # Exit immediately if a command exits with a non-zero status.
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
set -u # Treat unset variables as an error.
|
||||||
&& export VLLM_XLA_CACHE_PATH= \
|
|
||||||
&& export VLLM_USE_V1=1 \
|
|
||||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
|
||||||
&& echo HARDWARE \
|
|
||||||
&& tpu-info \
|
|
||||||
&& { \
|
|
||||||
echo TEST_0: Running test_perf.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
|
||||||
echo TEST_0_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_1: Running test_compilation.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
|
||||||
echo TEST_1_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_2: Running test_basic.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
|
||||||
echo TEST_2_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
echo TEST_3_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
|
||||||
echo TEST_4_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
|
||||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
|
||||||
echo TEST_5_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
|
||||||
echo TEST_6_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_7: Running test_sampler.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
|
||||||
echo TEST_7_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
|
||||||
echo TEST_8_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_9: Running test_multimodal.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
|
||||||
echo TEST_9_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_10: Running test_pallas.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
|
||||||
echo TEST_10_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_11: Running test_struct_output_generate.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
|
||||||
echo TEST_11_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_12: Running test_moe_pallas.py; \
|
|
||||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
|
||||||
echo TEST_12_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
# Disable the TPU LoRA tests until the feature is activated
|
|
||||||
# & { \
|
|
||||||
# echo TEST_13: Running test_moe_pallas.py; \
|
|
||||||
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
|
||||||
# echo TEST_13_EXIT_CODE: \$?; \
|
|
||||||
# } & \
|
|
||||||
wait \
|
|
||||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
|
||||||
"
|
|
||||||
|
|
||||||
|
echo "--- Starting script inside Docker container ---"
|
||||||
|
|
||||||
|
# Create results directory
|
||||||
|
RESULTS_DIR=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $RESULTS_DIR"
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
echo "--- Installing Python dependencies ---"
|
||||||
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||||
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
|
echo "--- Hardware Information ---"
|
||||||
|
tpu-info
|
||||||
|
echo "--- Starting Tests ---"
|
||||||
|
set +e
|
||||||
|
overall_script_exit_code=0
|
||||||
|
|
||||||
|
# --- Test Definitions ---
|
||||||
|
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||||
|
run_test() {
|
||||||
|
local test_num=$1
|
||||||
|
local test_name=$2
|
||||||
|
local test_command=$3
|
||||||
|
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||||
|
local actual_exit_code
|
||||||
|
|
||||||
|
echo "--- TEST_$test_num: Running $test_name ---"
|
||||||
|
|
||||||
|
# Execute the test command.
|
||||||
|
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||||
|
actual_exit_code=$?
|
||||||
|
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||||
|
|
||||||
|
if [ "$actual_exit_code" -ne 0 ]; then
|
||||||
|
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||||
|
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||||
|
if [ -f "$log_file" ]; then
|
||||||
|
cat "$log_file" >&2
|
||||||
|
else
|
||||||
|
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||||
|
fi
|
||||||
|
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||||
|
return "$actual_exit_code" # Return the failure code
|
||||||
|
else
|
||||||
|
echo "TEST_$test_num ($test_name) PASSED."
|
||||||
|
return 0 # Return success
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Helper function to call run_test and update the overall script exit code
|
||||||
|
run_and_track_test() {
|
||||||
|
local test_num_arg="$1"
|
||||||
|
local test_name_arg="$2"
|
||||||
|
local test_command_arg="$3"
|
||||||
|
|
||||||
|
# Run the test
|
||||||
|
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||||
|
local test_specific_exit_code=$?
|
||||||
|
|
||||||
|
# If the test failed, set the overall script exit code to 1
|
||||||
|
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||||
|
# No need for extra echo here, run_test already logged the failure.
|
||||||
|
overall_script_exit_code=1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Actual Test Execution ---
|
||||||
|
run_and_track_test 0 "test_perf.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||||
|
run_and_track_test 1 "test_compilation.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||||
|
run_and_track_test 2 "test_basic.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||||
|
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||||
|
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||||
|
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||||
|
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||||
|
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||||
|
run_and_track_test 7 "test_sampler.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||||
|
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||||
|
run_and_track_test 9 "test_multimodal.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||||
|
run_and_track_test 10 "test_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||||
|
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||||
|
run_and_track_test 12 "test_moe_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||||
|
run_and_track_test 13 "test_lora.py" \
|
||||||
|
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||||
|
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||||
|
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||||
|
|
||||||
|
# After all tests have been attempted, exit with the overall status.
|
||||||
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||||
|
else
|
||||||
|
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||||
|
fi
|
||||||
|
exit "$overall_script_exit_code"
|
||||||
|
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||||
|
|
||||||
|
# Capture the exit code of the docker run command
|
||||||
|
DOCKER_RUN_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# The trap will run for cleanup.
|
||||||
|
# Exit the main script with the Docker run command's exit code.
|
||||||
|
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||||
|
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||||
|
exit "$DOCKER_RUN_EXIT_CODE"
|
||||||
|
else
|
||||||
|
echo "Docker run command completed successfully."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
|
|||||||
18
.buildkite/scripts/rerun-test.sh
Normal file
18
.buildkite/scripts/rerun-test.sh
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Usage: ./rerun_test.sh path/to/test.py::test_name
|
||||||
|
|
||||||
|
# Check if argument is given
|
||||||
|
if [ $# -lt 1 ]; then
|
||||||
|
echo "Usage: $0 path/to/test.py::test_name"
|
||||||
|
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
TEST=$1
|
||||||
|
COUNT=1
|
||||||
|
|
||||||
|
while pytest -sv "$TEST"; do
|
||||||
|
COUNT=$((COUNT + 1))
|
||||||
|
echo "RUN NUMBER ${COUNT}"
|
||||||
|
done
|
||||||
24
.buildkite/scripts/tpu/cleanup_docker.sh
Executable file
24
.buildkite/scripts/tpu/cleanup_docker.sh
Executable file
@@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
14
.buildkite/scripts/tpu/config_v6e_1.env
Normal file
14
.buildkite/scripts/tpu/config_v6e_1.env
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# Environment config
|
||||||
|
TEST_NAME=llama8b
|
||||||
|
CONTAINER_NAME=vllm-tpu
|
||||||
|
|
||||||
|
# vllm config
|
||||||
|
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
MAX_NUM_SEQS=512
|
||||||
|
MAX_NUM_BATCHED_TOKENS=512
|
||||||
|
TENSOR_PARALLEL_SIZE=1
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
|
DOWNLOAD_DIR=/mnt/disks/persist
|
||||||
|
EXPECTED_THROUGHPUT=8.0
|
||||||
|
INPUT_LEN=1800
|
||||||
|
OUTPUT_LEN=128
|
||||||
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
@@ -0,0 +1,102 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
if [ ! -f "$1" ]; then
|
||||||
|
echo "Error: The env file '$1' does not exist."
|
||||||
|
exit 1 # Exit the script with a non-zero status to indicate an error
|
||||||
|
fi
|
||||||
|
|
||||||
|
ENV_FILE=$1
|
||||||
|
|
||||||
|
# For testing on local vm, use `set -a` to export all variables
|
||||||
|
source /etc/environment
|
||||||
|
source $ENV_FILE
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
docker rm -f $CONTAINER_NAME || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Build docker image.
|
||||||
|
# TODO: build the image outside the script and share the image with other
|
||||||
|
# tpu test if building time is too long.
|
||||||
|
DOCKER_BUILDKIT=1 docker build \
|
||||||
|
--build-arg max_jobs=16 \
|
||||||
|
--build-arg USE_SCCACHE=1 \
|
||||||
|
--build-arg GIT_REPO_CHECK=0 \
|
||||||
|
--tag vllm/vllm-tpu-bm \
|
||||||
|
--progress plain -f docker/Dockerfile.tpu .
|
||||||
|
|
||||||
|
LOG_ROOT=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $LOG_ROOT"
|
||||||
|
|
||||||
|
if [ -z "$HF_TOKEN" ]; then
|
||||||
|
echo "Error: HF_TOKEN is not set or is empty."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make sure mounted disk or dir exists
|
||||||
|
if [ ! -d "$DOWNLOAD_DIR" ]; then
|
||||||
|
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Run model $MODEL"
|
||||||
|
echo
|
||||||
|
|
||||||
|
echo "starting docker...$CONTAINER_NAME"
|
||||||
|
echo
|
||||||
|
docker run \
|
||||||
|
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
||||||
|
--env-file $ENV_FILE \
|
||||||
|
-e HF_TOKEN="$HF_TOKEN" \
|
||||||
|
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
||||||
|
-e MODEL=$MODEL \
|
||||||
|
-e WORKSPACE=/workspace \
|
||||||
|
--name $CONTAINER_NAME \
|
||||||
|
-d \
|
||||||
|
--privileged \
|
||||||
|
--network host \
|
||||||
|
-v /dev/shm:/dev/shm \
|
||||||
|
vllm/vllm-tpu-bm tail -f /dev/null
|
||||||
|
|
||||||
|
echo "run script..."
|
||||||
|
echo
|
||||||
|
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
|
||||||
|
|
||||||
|
echo "copy result back..."
|
||||||
|
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
|
||||||
|
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
|
||||||
|
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
|
||||||
|
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
|
||||||
|
|
||||||
|
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
|
||||||
|
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
|
||||||
|
|
||||||
|
if [ "$BUILDKITE" = "true" ]; then
|
||||||
|
echo "Running inside Buildkite"
|
||||||
|
buildkite-agent artifact upload "$VLLM_LOG"
|
||||||
|
buildkite-agent artifact upload "$BM_LOG"
|
||||||
|
else
|
||||||
|
echo "Not running inside Buildkite"
|
||||||
|
fi
|
||||||
|
|
||||||
|
#
|
||||||
|
# compare the throughput with EXPECTED_THROUGHPUT
|
||||||
|
# and assert meeting the expectation
|
||||||
|
#
|
||||||
|
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
|
||||||
|
echo "Failed to get the throughput"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
|
||||||
|
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
94
.buildkite/scripts/tpu/run_bm.sh
Executable file
94
.buildkite/scripts/tpu/run_bm.sh
Executable file
@@ -0,0 +1,94 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
VLLM_LOG="$WORKSPACE/vllm_log.txt"
|
||||||
|
BM_LOG="$WORKSPACE/bm_log.txt"
|
||||||
|
|
||||||
|
if [ -n "$TARGET_COMMIT" ]; then
|
||||||
|
head_hash=$(git rev-parse HEAD)
|
||||||
|
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
|
||||||
|
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "model: $MODEL"
|
||||||
|
echo
|
||||||
|
|
||||||
|
#
|
||||||
|
# create a log folder
|
||||||
|
#
|
||||||
|
mkdir "$WORKSPACE/log"
|
||||||
|
|
||||||
|
# TODO: Move to image building.
|
||||||
|
pip install pandas
|
||||||
|
pip install datasets
|
||||||
|
|
||||||
|
#
|
||||||
|
# create sonnet_4x
|
||||||
|
#
|
||||||
|
echo "Create sonnet_4x.txt"
|
||||||
|
echo "" > benchmarks/sonnet_4x.txt
|
||||||
|
for _ in {1..4}
|
||||||
|
do
|
||||||
|
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
||||||
|
done
|
||||||
|
|
||||||
|
#
|
||||||
|
# start vllm service in backend
|
||||||
|
#
|
||||||
|
echo "lanching vllm..."
|
||||||
|
echo "logging to $VLLM_LOG"
|
||||||
|
echo
|
||||||
|
|
||||||
|
VLLM_USE_V1=1 vllm serve $MODEL \
|
||||||
|
--seed 42 \
|
||||||
|
--disable-log-requests \
|
||||||
|
--max-num-seqs $MAX_NUM_SEQS \
|
||||||
|
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
||||||
|
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
||||||
|
--no-enable-prefix-caching \
|
||||||
|
--download_dir $DOWNLOAD_DIR \
|
||||||
|
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
|
||||||
|
|
||||||
|
|
||||||
|
echo "wait for 20 minutes.."
|
||||||
|
echo
|
||||||
|
# sleep 1200
|
||||||
|
# wait for 10 minutes...
|
||||||
|
for i in {1..120}; do
|
||||||
|
# TODO: detect other type of errors.
|
||||||
|
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
||||||
|
echo "Detected RuntimeError, exiting."
|
||||||
|
exit 1
|
||||||
|
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
|
||||||
|
echo "Application started"
|
||||||
|
break
|
||||||
|
else
|
||||||
|
echo "wait for 10 seconds..."
|
||||||
|
sleep 10
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
#
|
||||||
|
# run test
|
||||||
|
#
|
||||||
|
echo "run benchmark test..."
|
||||||
|
echo "logging to $BM_LOG"
|
||||||
|
echo
|
||||||
|
python benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--model $MODEL \
|
||||||
|
--dataset-name sonnet \
|
||||||
|
--dataset-path benchmarks/sonnet_4x.txt \
|
||||||
|
--sonnet-input-len $INPUT_LEN \
|
||||||
|
--sonnet-output-len $OUTPUT_LEN \
|
||||||
|
--ignore-eos > "$BM_LOG"
|
||||||
|
|
||||||
|
echo "completed..."
|
||||||
|
echo
|
||||||
|
|
||||||
|
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
|
||||||
|
echo "throughput: $throughput"
|
||||||
|
echo
|
||||||
@@ -145,6 +145,7 @@ steps:
|
|||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
commands:
|
commands:
|
||||||
# test with tp=2 and external_dp=2
|
# test with tp=2 and external_dp=2
|
||||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
@@ -154,6 +155,7 @@ steps:
|
|||||||
# test with internal dp
|
# test with internal dp
|
||||||
- python3 ../examples/offline_inference/data_parallel.py
|
- python3 ../examples/offline_inference/data_parallel.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
@@ -199,8 +201,9 @@ steps:
|
|||||||
- tests/test_sequence
|
- tests/test_sequence
|
||||||
- tests/test_config
|
- tests/test_config
|
||||||
- tests/test_logger
|
- tests/test_logger
|
||||||
|
- tests/test_vllm_port
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||||
# OOM in the CI unless we run this separately
|
# OOM in the CI unless we run this separately
|
||||||
- pytest -v -s tokenization
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
@@ -274,17 +277,6 @@ steps:
|
|||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
- label: LogitsProcessor Test # 5min
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor/layers
|
|
||||||
- vllm/model_executor/guided_decoding
|
|
||||||
- tests/test_logits_processor
|
|
||||||
- tests/model_executor/test_guided_processors
|
|
||||||
commands:
|
|
||||||
- pytest -v -s test_logits_processor.py
|
|
||||||
- pytest -v -s model_executor/test_guided_processors.py
|
|
||||||
|
|
||||||
- label: Speculative decoding tests # 40min
|
- label: Speculative decoding tests # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -297,7 +289,7 @@ steps:
|
|||||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
@@ -328,6 +320,7 @@ steps:
|
|||||||
# these tests need to be separated, cannot combine
|
# these tests need to be separated, cannot combine
|
||||||
- pytest -v -s compile/piecewise/test_simple.py
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
|
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: PyTorch Fullgraph Test # 18min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@@ -397,6 +390,17 @@ steps:
|
|||||||
- pytest -v -s tensorizer_loader
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
|
|
||||||
|
- label: Model Executor Test
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
soft_fail: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor
|
||||||
|
- tests/model_executor
|
||||||
|
commands:
|
||||||
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s model_executor
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
@@ -420,6 +424,9 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/quantization
|
- tests/quantization
|
||||||
commands:
|
commands:
|
||||||
|
# temporary install here since we need nightly, will move to requirements/test.in
|
||||||
|
# after torchao 0.12 release
|
||||||
|
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
@@ -617,9 +624,11 @@ steps:
|
|||||||
- vllm/worker/model_runner.py
|
- vllm/worker/model_runner.py
|
||||||
- entrypoints/llm/test_collective_rpc.py
|
- entrypoints/llm/test_collective_rpc.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
|
|||||||
16
.github/CODEOWNERS
vendored
16
.github/CODEOWNERS
vendored
@@ -10,15 +10,17 @@
|
|||||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||||
/vllm/model_executor/guided_decoding @mgoin @russellb
|
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96
|
/vllm/multimodal @DarkLight1337 @ywang96
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson
|
||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
|
/vllm/reasoning @aarnphm
|
||||||
|
/vllm/entrypoints @aarnphm
|
||||||
CMakeLists.txt @tlrmchlsmth
|
CMakeLists.txt @tlrmchlsmth
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
/vllm/v1/structured_output @mgoin @russellb
|
/vllm/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
|
|
||||||
# Test ownership
|
# Test ownership
|
||||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||||
@@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
|
|||||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||||
/tests/distributed/test_same_node.py @youkaichao
|
/tests/distributed/test_same_node.py @youkaichao
|
||||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
|
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
|
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
||||||
/tests/models @DarkLight1337 @ywang96
|
/tests/models @DarkLight1337 @ywang96
|
||||||
@@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
|
|||||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/v1/structured_output @mgoin @russellb
|
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
/tests/weight_loading @mgoin @youkaichao
|
/tests/weight_loading @mgoin @youkaichao
|
||||||
/tests/lora @jeejeelee
|
/tests/lora @jeejeelee
|
||||||
|
|
||||||
# Docs
|
# Docs
|
||||||
/docs @hmellor
|
/docs @hmellor
|
||||||
mkdocs.yaml @hmellor
|
mkdocs.yaml @hmellor
|
||||||
|
|||||||
10
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
10
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
@@ -8,6 +8,16 @@ body:
|
|||||||
attributes:
|
attributes:
|
||||||
value: >
|
value: >
|
||||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: |
|
||||||
|
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
|
||||||
|
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
|
||||||
|
- Passwords or authentication credentials
|
||||||
|
- Private URLs or endpoints
|
||||||
|
- Personal or confidential data
|
||||||
|
|
||||||
|
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
|
||||||
- type: textarea
|
- type: textarea
|
||||||
attributes:
|
attributes:
|
||||||
label: Your current environment
|
label: Your current environment
|
||||||
|
|||||||
16
.github/PULL_REQUEST_TEMPLATE.md
vendored
16
.github/PULL_REQUEST_TEMPLATE.md
vendored
@@ -1,6 +1,18 @@
|
|||||||
FILL IN THE PR DESCRIPTION HERE
|
## Essential Elements of an Effective PR Description Checklist
|
||||||
|
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||||
|
- [ ] The test plan, such as providing test command.
|
||||||
|
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||||
|
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||||
|
|
||||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
## Test Plan
|
||||||
|
|
||||||
|
## Test Result
|
||||||
|
|
||||||
|
## (Optional) Documentation Update
|
||||||
|
|
||||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
||||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||||
|
|||||||
14
.github/mergify.yml
vendored
14
.github/mergify.yml
vendored
@@ -36,6 +36,20 @@ pull_request_rules:
|
|||||||
add:
|
add:
|
||||||
- frontend
|
- frontend
|
||||||
|
|
||||||
|
- name: label-llama
|
||||||
|
description: Automatically apply llama label
|
||||||
|
conditions:
|
||||||
|
- or:
|
||||||
|
- files~=^examples/.*llama.*\.py
|
||||||
|
- files~=^tests/.*llama.*\.py
|
||||||
|
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
|
||||||
|
- files~=^vllm/model_executor/models/.*llama.*\.py
|
||||||
|
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
|
||||||
|
actions:
|
||||||
|
label:
|
||||||
|
add:
|
||||||
|
- llama
|
||||||
|
|
||||||
- name: label-multi-modality
|
- name: label-multi-modality
|
||||||
description: Automatically apply multi-modality label
|
description: Automatically apply multi-modality label
|
||||||
conditions:
|
conditions:
|
||||||
|
|||||||
@@ -11,6 +11,8 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: yapf
|
- id: yapf
|
||||||
args: [--in-place, --verbose]
|
args: [--in-place, --verbose]
|
||||||
|
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
|
||||||
|
exclude: '(.buildkite|benchmarks|build|examples)/.*'
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.11.7
|
rev: v0.11.7
|
||||||
hooks:
|
hooks:
|
||||||
@@ -58,7 +60,7 @@ repos:
|
|||||||
entry: tools/mypy.sh 0 "local"
|
entry: tools/mypy.sh 0 "local"
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
|
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||||
stages: [pre-commit] # Don't run in CI
|
stages: [pre-commit] # Don't run in CI
|
||||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.9
|
name: Run mypy for Python 3.9
|
||||||
|
|||||||
@@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
|||||||
# Suppress potential warnings about unused manually-specified variables
|
# Suppress potential warnings about unused manually-specified variables
|
||||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||||
|
|
||||||
|
# Prevent installation of dependencies (cutlass) by default.
|
||||||
|
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported python versions. These versions will be searched in order, the
|
# Supported python versions. These versions will be searched in order, the
|
||||||
# first match will be selected. These should be kept in sync with setup.py.
|
# first match will be selected. These should be kept in sync with setup.py.
|
||||||
@@ -179,9 +182,6 @@ include(FetchContent)
|
|||||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||||
|
|
||||||
#
|
|
||||||
# Set rocm version dev int.
|
|
||||||
#
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||||
#
|
#
|
||||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
||||||
@@ -189,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
||||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
||||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
||||||
@@ -243,6 +242,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
"csrc/layernorm_quant_kernels.cu"
|
"csrc/layernorm_quant_kernels.cu"
|
||||||
|
"csrc/sampler.cu"
|
||||||
"csrc/cuda_view.cu"
|
"csrc/cuda_view.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
# are not supported by Machete yet.
|
# are not supported by Machete yet.
|
||||||
# 9.0 for latest bf16 atomicAdd PTX
|
# 9.0 for latest bf16 atomicAdd PTX
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||||
if (MARLIN_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# kernels for the remaining archs that are not already built for 3x.
|
# kernels for the remaining archs that are not already built for 3x.
|
||||||
# (Build 8.9 for FP8)
|
# (Build 8.9 for FP8)
|
||||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||||
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
|
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
|
||||||
# subtract out the archs that are already built for 3x
|
# subtract out the archs that are already built for 3x
|
||||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
if (SCALED_MM_2X_ARCHS)
|
if (SCALED_MM_2X_ARCHS)
|
||||||
@@ -543,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# CUTLASS MoE kernels
|
# CUTLASS MoE kernels
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
||||||
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
|
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
||||||
# to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
||||||
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
||||||
# 9.0 for latest bf16 atomicAdd PTX
|
# 9.0 for latest bf16 atomicAdd PTX
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||||
if (MARLIN_MOE_ARCHS)
|
if (MARLIN_MOE_ARCHS)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -785,5 +785,7 @@ endif()
|
|||||||
# For CUDA we also build and ship some external projects.
|
# For CUDA we also build and ship some external projects.
|
||||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
include(cmake/external_projects/flashmla.cmake)
|
include(cmake/external_projects/flashmla.cmake)
|
||||||
|
|
||||||
|
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||||
endif ()
|
endif ()
|
||||||
|
|||||||
10
README.md
10
README.md
@@ -58,8 +58,8 @@ vLLM is fast with:
|
|||||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||||
- Continuous batching of incoming requests
|
- Continuous batching of incoming requests
|
||||||
- Fast model execution with CUDA/HIP graph
|
- Fast model execution with CUDA/HIP graph
|
||||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
|
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
|
||||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
|
||||||
- Speculative decoding
|
- Speculative decoding
|
||||||
- Chunked prefill
|
- Chunked prefill
|
||||||
|
|
||||||
@@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
|
|||||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||||
- Streaming outputs
|
- Streaming outputs
|
||||||
- OpenAI-compatible API server
|
- OpenAI-compatible API server
|
||||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
|
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
|
||||||
- Prefix caching support
|
- Prefix caching support
|
||||||
- Multi-LoRA support
|
- Multi-LoRA support
|
||||||
|
|
||||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||||
- Transformer-like LLMs (e.g., Llama)
|
- Transformer-like LLMs (e.g., Llama)
|
||||||
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
||||||
- Embedding Models (e.g. E5-Mistral)
|
- Embedding Models (e.g., E5-Mistral)
|
||||||
- Multi-modal LLMs (e.g., LLaVA)
|
- Multi-modal LLMs (e.g., LLaVA)
|
||||||
|
|
||||||
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||||
@@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
|||||||
|
|
||||||
## Media Kit
|
## Media Kit
|
||||||
|
|
||||||
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
|
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
|
||||||
|
|||||||
@@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
|
||||||
|
|
||||||
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
|
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
|
||||||
|
|||||||
@@ -64,6 +64,12 @@ become available.
|
|||||||
<td style="text-align: center;">✅</td>
|
<td style="text-align: center;">✅</td>
|
||||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>Custom</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td>Local file: <code>data.jsonl</code></td>
|
||||||
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
@@ -124,6 +130,38 @@ P99 ITL (ms): 8.39
|
|||||||
==================================================
|
==================================================
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Custom Dataset
|
||||||
|
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
||||||
|
|
||||||
|
```
|
||||||
|
{"prompt": "What is the capital of India?"}
|
||||||
|
{"prompt": "What is the capital of Iran?"}
|
||||||
|
{"prompt": "What is the capital of China?"}
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# start server
|
||||||
|
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# run benchmarking script
|
||||||
|
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||||
|
--backend vllm \
|
||||||
|
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--dataset-name custom \
|
||||||
|
--dataset-path <path-to-your-data-jsonl> \
|
||||||
|
--custom-skip-chat-template \
|
||||||
|
--num-prompts 80 \
|
||||||
|
--max-concurrency 1 \
|
||||||
|
--temperature=0.3 \
|
||||||
|
--top-p=0.75 \
|
||||||
|
--result-dir "./log/"
|
||||||
|
```
|
||||||
|
|
||||||
|
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||||
|
|
||||||
### VisionArena Benchmark for Vision Language Models
|
### VisionArena Benchmark for Vision Language Models
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -146,9 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
--ngram_prompt_lookup_min 2 \
|
--speculative-config $'{"method": "ngram",
|
||||||
--ngram-prompt-lookup-max 5 \
|
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||||
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
|
"prompt_lookup_min": 2}'
|
||||||
```
|
```
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
@@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
|||||||
--seed 42
|
--seed 42
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**`philschmid/mt-bench`**
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--model Qwen/QwQ-32B \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path philschmid/mt-bench \
|
||||||
|
--num-prompts 80
|
||||||
|
```
|
||||||
|
|
||||||
### Running With Sampling Parameters
|
### Running With Sampling Parameters
|
||||||
|
|
||||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||||
@@ -273,9 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
|||||||
--output-len=100 \
|
--output-len=100 \
|
||||||
--num-prompts=2048 \
|
--num-prompts=2048 \
|
||||||
--async-engine \
|
--async-engine \
|
||||||
--ngram_prompt_lookup_min=2 \
|
--speculative-config $'{"method": "ngram",
|
||||||
--ngram-prompt-lookup-max=5 \
|
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||||
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
|
"prompt_lookup_min": 2}'
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -10,11 +10,15 @@
|
|||||||
# 3. Set variables (ALL REQUIRED)
|
# 3. Set variables (ALL REQUIRED)
|
||||||
# BASE: your directory for vllm repo
|
# BASE: your directory for vllm repo
|
||||||
# MODEL: the model served by vllm
|
# MODEL: the model served by vllm
|
||||||
|
# TP: ways of tensor parallelism
|
||||||
# DOWNLOAD_DIR: directory to download and load model weights.
|
# DOWNLOAD_DIR: directory to download and load model weights.
|
||||||
# INPUT_LEN: request input len
|
# INPUT_LEN: request input len
|
||||||
# OUTPUT_LEN: request output len
|
# OUTPUT_LEN: request output len
|
||||||
# MIN_CACHE_HIT_PCT: prefix cache rate
|
# MIN_CACHE_HIT_PCT: prefix cache rate
|
||||||
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
||||||
|
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
|
||||||
|
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
|
||||||
|
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
|
||||||
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
|
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
|
||||||
# 5. The final result will be saved in RESULT file.
|
# 5. The final result will be saved in RESULT file.
|
||||||
|
|
||||||
@@ -30,31 +34,27 @@
|
|||||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
BASE=""
|
BASE=""
|
||||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||||
|
TP=1
|
||||||
DOWNLOAD_DIR=""
|
DOWNLOAD_DIR=""
|
||||||
INPUT_LEN=4000
|
INPUT_LEN=4000
|
||||||
OUTPUT_LEN=16
|
OUTPUT_LEN=16
|
||||||
MIN_CACHE_HIT_PCT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
|
NUM_SEQS_LIST="128 256"
|
||||||
|
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
|
||||||
|
|
||||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||||
RESULT="$LOG_FOLDER/result.txt"
|
RESULT="$LOG_FOLDER/result.txt"
|
||||||
|
|
||||||
echo "result file$ $RESULT"
|
echo "result file: $RESULT"
|
||||||
echo "model: $MODEL"
|
echo "model: $MODEL"
|
||||||
echo
|
|
||||||
|
|
||||||
rm -rf $LOG_FOLDER
|
rm -rf $LOG_FOLDER
|
||||||
mkdir -p $LOG_FOLDER
|
mkdir -p $LOG_FOLDER
|
||||||
|
|
||||||
cd "$BASE/vllm"
|
cd "$BASE/vllm"
|
||||||
# create sonnet-4x.txt so that we can sample 2048 tokens for input
|
|
||||||
echo "" > benchmarks/sonnet_4x.txt
|
|
||||||
for _ in {1..4}
|
|
||||||
do
|
|
||||||
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
|
||||||
done
|
|
||||||
|
|
||||||
pip install datasets
|
pip install -q datasets
|
||||||
|
|
||||||
current_hash=$(git rev-parse HEAD)
|
current_hash=$(git rev-parse HEAD)
|
||||||
echo "hash:$current_hash" >> "$RESULT"
|
echo "hash:$current_hash" >> "$RESULT"
|
||||||
@@ -64,53 +64,69 @@ best_throughput=0
|
|||||||
best_max_num_seqs=0
|
best_max_num_seqs=0
|
||||||
best_num_batched_tokens=0
|
best_num_batched_tokens=0
|
||||||
best_goodput=0
|
best_goodput=0
|
||||||
|
|
||||||
|
start_server() {
|
||||||
|
local gpu_memory_utilization=$1
|
||||||
|
local max_num_seqs=$2
|
||||||
|
local max_num_batched_tokens=$3
|
||||||
|
local vllm_log=$4
|
||||||
|
|
||||||
|
pkill -f vllm
|
||||||
|
|
||||||
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
|
||||||
|
--disable-log-requests \
|
||||||
|
--port 8004 \
|
||||||
|
--gpu-memory-utilization $gpu_memory_utilization \
|
||||||
|
--max-num-seqs $max_num_seqs \
|
||||||
|
--max-num-batched-tokens $max_num_batched_tokens \
|
||||||
|
--tensor-parallel-size $TP \
|
||||||
|
--enable-prefix-caching \
|
||||||
|
--load-format dummy \
|
||||||
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
|
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
||||||
|
|
||||||
|
# wait for 10 minutes...
|
||||||
|
server_started=0
|
||||||
|
for i in {1..60}; do
|
||||||
|
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||||
|
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||||
|
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||||
|
server_started=1
|
||||||
|
break
|
||||||
|
else
|
||||||
|
sleep 10
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if (( ! server_started )); then
|
||||||
|
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
||||||
|
return 1
|
||||||
|
else
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
run_benchmark() {
|
run_benchmark() {
|
||||||
local max_num_seqs=$1
|
local max_num_seqs=$1
|
||||||
local max_num_batched_tokens=$2
|
local max_num_batched_tokens=$2
|
||||||
|
local gpu_memory_utilization=$3
|
||||||
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||||
echo "vllm_log: $vllm_log"
|
echo "vllm_log: $vllm_log"
|
||||||
echo
|
echo
|
||||||
rm -f $vllm_log
|
rm -f $vllm_log
|
||||||
|
pkill -f vllm
|
||||||
|
|
||||||
# start the server
|
echo "starting server..."
|
||||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
|
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
|
||||||
--disable-log-requests \
|
result=$?
|
||||||
--port 8004 \
|
if [[ "$result" -eq 1 ]]; then
|
||||||
--gpu-memory-utilization 0.98 \
|
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
--max-num-seqs $max_num_seqs \
|
else
|
||||||
--max-num-batched-tokens $max_num_batched_tokens \
|
echo "server started."
|
||||||
--tensor-parallel-size 1 \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--load-format dummy \
|
|
||||||
--download-dir $DOWNLOAD_DIR \
|
|
||||||
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
|
||||||
echo "wait for 10 minutes.."
|
|
||||||
echo
|
|
||||||
# wait for 10 minutes...
|
|
||||||
server_started=0
|
|
||||||
for i in {1..60}; do
|
|
||||||
if grep -Fq "Application startup complete" "$vllm_log"; then
|
|
||||||
echo "Application started"
|
|
||||||
server_started=1
|
|
||||||
break
|
|
||||||
else
|
|
||||||
# echo "wait for 10 seconds..."
|
|
||||||
sleep 10
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if (( ! server_started )); then
|
|
||||||
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
|
|
||||||
echo "pkill -f vllm"
|
|
||||||
echo
|
|
||||||
pkill vllm
|
|
||||||
sleep 10
|
|
||||||
return 1
|
|
||||||
fi
|
fi
|
||||||
|
echo
|
||||||
|
|
||||||
echo "run benchmark test..."
|
echo "run benchmark test..."
|
||||||
echo
|
|
||||||
meet_latency_requirement=0
|
meet_latency_requirement=0
|
||||||
# get a basic qps by using request-rate inf
|
# get a basic qps by using request-rate inf
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||||
@@ -118,29 +134,29 @@ run_benchmark() {
|
|||||||
python benchmarks/benchmark_serving.py \
|
python benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name sonnet \
|
--dataset-name random \
|
||||||
--dataset-path benchmarks/sonnet_4x.txt \
|
--random-input-len $INPUT_LEN \
|
||||||
--sonnet-input-len $INPUT_LEN \
|
--random-output-len $OUTPUT_LEN \
|
||||||
--sonnet-output-len $OUTPUT_LEN \
|
|
||||||
--ignore-eos \
|
--ignore-eos \
|
||||||
--disable-tqdm \
|
--disable-tqdm \
|
||||||
--request-rate inf \
|
--request-rate inf \
|
||||||
--percentile-metrics ttft,tpot,itl,e2el \
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
--num-prompts 100 \
|
--num-prompts 1000 \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--random-prefix-len $prefix_len \
|
||||||
--port 8004 > "$bm_log"
|
--port 8004 &> "$bm_log"
|
||||||
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
|
|
||||||
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||||
meet_latency_requirement=1
|
meet_latency_requirement=1
|
||||||
|
request_rate=inf
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if (( ! meet_latency_requirement )); then
|
if (( ! meet_latency_requirement )); then
|
||||||
# start from request-rate as int(through_put) + 1
|
# start from request-rate as int(throughput) + 1
|
||||||
request_rate=$((${through_put%.*} + 1))
|
request_rate=$((${throughput%.*} + 1))
|
||||||
while ((request_rate > 0)); do
|
while ((request_rate > 0)); do
|
||||||
# clear prefix cache
|
# clear prefix cache
|
||||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||||
@@ -149,19 +165,18 @@ run_benchmark() {
|
|||||||
python benchmarks/benchmark_serving.py \
|
python benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
--dataset-name sonnet \
|
--dataset-name random \
|
||||||
--dataset-path benchmarks/sonnet_4x.txt \
|
--random-input-len $INPUT_LEN \
|
||||||
--sonnet-input-len $INPUT_LEN \
|
--random-output-len $OUTPUT_LEN \
|
||||||
--sonnet-output-len $OUTPUT_LEN \
|
--ignore-eos \
|
||||||
--ignore_eos \
|
|
||||||
--disable-tqdm \
|
--disable-tqdm \
|
||||||
--request-rate $request_rate \
|
--request-rate $request_rate \
|
||||||
--percentile-metrics ttft,tpot,itl,e2el \
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
--num-prompts 100 \
|
--num-prompts 100 \
|
||||||
--sonnet-prefix-len $prefix_len \
|
--random-prefix-len $prefix_len \
|
||||||
--port 8004 > "$bm_log"
|
--port 8004 &> "$bm_log"
|
||||||
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||||
@@ -173,10 +188,10 @@ run_benchmark() {
|
|||||||
fi
|
fi
|
||||||
# write the results and update the best result.
|
# write the results and update the best result.
|
||||||
if ((meet_latency_requirement)); then
|
if ((meet_latency_requirement)); then
|
||||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
|
||||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
|
||||||
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
|
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
|
||||||
best_throughput=$through_put
|
best_throughput=$throughput
|
||||||
best_max_num_seqs=$max_num_seqs
|
best_max_num_seqs=$max_num_seqs
|
||||||
best_num_batched_tokens=$max_num_batched_tokens
|
best_num_batched_tokens=$max_num_batched_tokens
|
||||||
best_goodput=$goodput
|
best_goodput=$goodput
|
||||||
@@ -188,22 +203,39 @@ run_benchmark() {
|
|||||||
|
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||||
|
|
||||||
echo "pkill -f vllm"
|
|
||||||
echo
|
|
||||||
pkill vllm
|
pkill vllm
|
||||||
sleep 10
|
sleep 10
|
||||||
rm -f $vllm_log
|
|
||||||
printf '=%.0s' $(seq 1 20)
|
printf '=%.0s' $(seq 1 20)
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
|
||||||
|
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
|
||||||
|
|
||||||
num_seqs_list="128 256"
|
# first find out the max gpu-memory-utilization without HBM OOM.
|
||||||
num_batched_tokens_list="512 1024 2048 4096"
|
gpu_memory_utilization=0.98
|
||||||
for num_seqs in $num_seqs_list; do
|
find_gpu_memory_utilization=0
|
||||||
for num_batched_tokens in $num_batched_tokens_list; do
|
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
||||||
run_benchmark $num_seqs $num_batched_tokens
|
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
|
||||||
exit 0
|
result=$?
|
||||||
|
if [[ "$result" -eq 0 ]]; then
|
||||||
|
find_gpu_memory_utilization=1
|
||||||
|
break
|
||||||
|
else
|
||||||
|
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
|
||||||
|
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
|
||||||
|
else
|
||||||
|
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
for num_seqs in "${num_seqs_list[@]}"; do
|
||||||
|
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
|
||||||
|
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
echo "finish permutations"
|
echo "finish permutations"
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
@@ -324,7 +325,7 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += text or ""
|
generated_text += text or ""
|
||||||
elif usage := data.get("usage"):
|
if usage := data.get("usage"):
|
||||||
output.output_tokens = usage.get("completion_tokens")
|
output.output_tokens = usage.get("completion_tokens")
|
||||||
if first_chunk_received:
|
if first_chunk_received:
|
||||||
output.success = True
|
output.success = True
|
||||||
@@ -611,6 +612,7 @@ ASYNC_REQUEST_FUNCS = {
|
|||||||
"tensorrt-llm": async_request_trt_llm,
|
"tensorrt-llm": async_request_trt_llm,
|
||||||
"scalellm": async_request_openai_completions,
|
"scalellm": async_request_openai_completions,
|
||||||
"sglang": async_request_openai_completions,
|
"sglang": async_request_openai_completions,
|
||||||
|
"llama.cpp": async_request_openai_completions,
|
||||||
}
|
}
|
||||||
|
|
||||||
OPENAI_COMPATIBLE_BACKENDS = [
|
OPENAI_COMPATIBLE_BACKENDS = [
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
This module defines a framework for sampling benchmark requests from various
|
This module defines a framework for sampling benchmark requests from various
|
||||||
datasets. Each dataset subclass of BenchmarkDataset must implement sample
|
datasets. Each dataset subclass of BenchmarkDataset must implement sample
|
||||||
@@ -9,9 +10,6 @@ generation. Supported dataset types include:
|
|||||||
- BurstGPT
|
- BurstGPT
|
||||||
- HuggingFace
|
- HuggingFace
|
||||||
- VisionArena
|
- VisionArena
|
||||||
|
|
||||||
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
|
|
||||||
SampleRequest instances, similar to the approach used in ShareGPT.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
@@ -442,6 +440,97 @@ class ShareGPTDataset(BenchmarkDataset):
|
|||||||
return samples
|
return samples
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Custom Dataset Implementation
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class CustomDataset(BenchmarkDataset):
|
||||||
|
"""
|
||||||
|
Implements the Custom dataset. Loads data from a JSONL file and generates
|
||||||
|
sample requests based on conversation turns. E.g.,
|
||||||
|
```
|
||||||
|
{"prompt": "What is the capital of India?"}
|
||||||
|
{"prompt": "What is the capital of Iran?"}
|
||||||
|
{"prompt": "What is the capital of China?"}
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, **kwargs) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.load_data()
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
if self.dataset_path is None:
|
||||||
|
raise ValueError("dataset_path must be provided for loading data.")
|
||||||
|
|
||||||
|
# self.data will be a list of dictionaries
|
||||||
|
# e.g., [{"prompt": "What is the capital of India?"}, ...]
|
||||||
|
# This will be the standardized format which load_data()
|
||||||
|
# has to convert into depending on the filetype of dataset_path.
|
||||||
|
# sample() will assume this standardized format of self.data
|
||||||
|
self.data = []
|
||||||
|
|
||||||
|
# Load the JSONL file
|
||||||
|
if self.dataset_path.endswith(".jsonl"):
|
||||||
|
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
|
||||||
|
|
||||||
|
# check if the JSONL file has a 'prompt' column
|
||||||
|
if "prompt" not in jsonl_data.columns:
|
||||||
|
raise ValueError("JSONL file must contain a 'prompt' column.")
|
||||||
|
|
||||||
|
# Convert each row to a dictionary and append to self.data
|
||||||
|
# This will convert the DataFrame to a list of dictionaries
|
||||||
|
# where each dictionary corresponds to a row in the DataFrame.
|
||||||
|
# This is the standardized format we want for self.data
|
||||||
|
for _, row in jsonl_data.iterrows():
|
||||||
|
self.data.append(row.to_dict())
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Only JSONL format is supported for CustomDataset."
|
||||||
|
)
|
||||||
|
|
||||||
|
random.seed(self.random_seed)
|
||||||
|
random.shuffle(self.data)
|
||||||
|
|
||||||
|
def sample(
|
||||||
|
self,
|
||||||
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
|
num_requests: int,
|
||||||
|
lora_path: Optional[str] = None,
|
||||||
|
max_loras: Optional[int] = None,
|
||||||
|
output_len: Optional[int] = None,
|
||||||
|
enable_multimodal_chat: bool = False,
|
||||||
|
skip_chat_template: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
|
sampled_requests = []
|
||||||
|
for item in self.data:
|
||||||
|
if len(sampled_requests) >= num_requests:
|
||||||
|
break
|
||||||
|
prompt = item["prompt"]
|
||||||
|
|
||||||
|
# apply template
|
||||||
|
if not skip_chat_template:
|
||||||
|
prompt = tokenizer.apply_chat_template(
|
||||||
|
[{"role": "user", "content": prompt}],
|
||||||
|
add_generation_prompt=True,
|
||||||
|
tokenize=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt_len = len(tokenizer(prompt).input_ids)
|
||||||
|
sampled_requests.append(
|
||||||
|
SampleRequest(
|
||||||
|
prompt=prompt,
|
||||||
|
prompt_len=prompt_len,
|
||||||
|
expected_output_len=output_len,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||||
|
|
||||||
|
return sampled_requests
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Sonnet Dataset Implementation
|
# Sonnet Dataset Implementation
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@@ -776,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
|
|||||||
for item in self.data:
|
for item in self.data:
|
||||||
if len(sampled_requests) >= num_requests:
|
if len(sampled_requests) >= num_requests:
|
||||||
break
|
break
|
||||||
prompt = f"{item['instruction']}:\n{item['input']}"
|
prompt = f"{item['input']}\n\n{item['instruction']} Just output \
|
||||||
|
the code, do not include any explanation."
|
||||||
|
|
||||||
|
# apply template
|
||||||
|
prompt = tokenizer.apply_chat_template(
|
||||||
|
[{"role": "user", "content": prompt}],
|
||||||
|
add_generation_prompt=True,
|
||||||
|
tokenize=False,
|
||||||
|
)
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
prompt_len = len(tokenizer(prompt).input_ids)
|
||||||
sampled_requests.append(
|
sampled_requests.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""Benchmark the latency of processing a single batch of requests."""
|
"""Benchmark the latency of processing a single batch of requests."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
@@ -6,13 +7,12 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
@@ -80,17 +80,9 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
def run_to_completion(profile_dir: Optional[str] = None):
|
def run_to_completion(profile_dir: Optional[str] = None):
|
||||||
if profile_dir:
|
if profile_dir:
|
||||||
with torch.profiler.profile(
|
llm.start_profile()
|
||||||
activities=[
|
llm_generate()
|
||||||
torch.profiler.ProfilerActivity.CPU,
|
llm.stop_profile()
|
||||||
torch.profiler.ProfilerActivity.CUDA,
|
|
||||||
],
|
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
|
||||||
str(profile_dir)
|
|
||||||
),
|
|
||||||
) as p:
|
|
||||||
llm_generate()
|
|
||||||
print(p.key_averages().table(sort_by="self_cuda_time_total"))
|
|
||||||
else:
|
else:
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
llm_generate()
|
llm_generate()
|
||||||
@@ -103,11 +95,7 @@ def main(args: argparse.Namespace):
|
|||||||
run_to_completion(profile_dir=None)
|
run_to_completion(profile_dir=None)
|
||||||
|
|
||||||
if args.profile:
|
if args.profile:
|
||||||
profile_dir = args.profile_result_dir
|
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||||
if not profile_dir:
|
|
||||||
profile_dir = (
|
|
||||||
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
|
|
||||||
)
|
|
||||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||||
run_to_completion(profile_dir=profile_dir)
|
run_to_completion(profile_dir=profile_dir)
|
||||||
return
|
return
|
||||||
@@ -164,15 +152,6 @@ if __name__ == "__main__":
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="profile the generation process of a single batch",
|
help="profile the generation process of a single batch",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--profile-result-dir",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help=(
|
|
||||||
"path to save the pytorch profiler output. Can be visualized "
|
|
||||||
"with ui.perfetto.dev or Tensorboard."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output-json",
|
"--output-json",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -193,4 +172,9 @@ if __name__ == "__main__":
|
|||||||
# numbers. We need to disable prefix caching by default.
|
# numbers. We need to disable prefix caching by default.
|
||||||
parser.set_defaults(enable_prefix_caching=False)
|
parser.set_defaults(enable_prefix_caching=False)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
||||||
|
raise OSError(
|
||||||
|
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
||||||
|
"Please set it to a valid path to use torch profiler."
|
||||||
|
)
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Offline benchmark to test the long document QA throughput.
|
Offline benchmark to test the long document QA throughput.
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Benchmark the efficiency of prefix caching.
|
Benchmark the efficiency of prefix caching.
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""Benchmark offline prioritization."""
|
"""Benchmark offline prioritization."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
r"""Benchmark online serving throughput.
|
r"""Benchmark online serving throughput.
|
||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
@@ -60,6 +61,7 @@ from benchmark_dataset import (
|
|||||||
ASRDataset,
|
ASRDataset,
|
||||||
BurstGPTDataset,
|
BurstGPTDataset,
|
||||||
ConversationDataset,
|
ConversationDataset,
|
||||||
|
CustomDataset,
|
||||||
HuggingFaceDataset,
|
HuggingFaceDataset,
|
||||||
InstructCoderDataset,
|
InstructCoderDataset,
|
||||||
MTBenchDataset,
|
MTBenchDataset,
|
||||||
@@ -627,7 +629,16 @@ def main(args: argparse.Namespace):
|
|||||||
"'--dataset-path' if required."
|
"'--dataset-path' if required."
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.dataset_name == "sonnet":
|
if args.dataset_name == "custom":
|
||||||
|
dataset = CustomDataset(dataset_path=args.dataset_path)
|
||||||
|
input_requests = dataset.sample(
|
||||||
|
num_requests=args.num_prompts,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
output_len=args.custom_output_len,
|
||||||
|
skip_chat_template=args.custom_skip_chat_template,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif args.dataset_name == "sonnet":
|
||||||
dataset = SonnetDataset(dataset_path=args.dataset_path)
|
dataset = SonnetDataset(dataset_path=args.dataset_path)
|
||||||
# For the "sonnet" dataset, formatting depends on the backend.
|
# For the "sonnet" dataset, formatting depends on the backend.
|
||||||
if args.backend == "openai-chat":
|
if args.backend == "openai-chat":
|
||||||
@@ -762,6 +773,10 @@ def main(args: argparse.Namespace):
|
|||||||
if "temperature" not in sampling_params:
|
if "temperature" not in sampling_params:
|
||||||
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
||||||
|
|
||||||
|
if args.backend == "llama.cpp":
|
||||||
|
# Disable prompt caching in llama.cpp backend
|
||||||
|
sampling_params["cache_prompt"] = False
|
||||||
|
|
||||||
# Avoid GC processing "static" data - reduce pause times.
|
# Avoid GC processing "static" data - reduce pause times.
|
||||||
gc.collect()
|
gc.collect()
|
||||||
gc.freeze()
|
gc.freeze()
|
||||||
@@ -834,6 +849,8 @@ def main(args: argparse.Namespace):
|
|||||||
]:
|
]:
|
||||||
if field in result_json:
|
if field in result_json:
|
||||||
del result_json[field]
|
del result_json[field]
|
||||||
|
if field in benchmark_result:
|
||||||
|
del benchmark_result[field]
|
||||||
|
|
||||||
# Save to file
|
# Save to file
|
||||||
base_model_id = model_id.split("/")[-1]
|
base_model_id = model_id.split("/")[-1]
|
||||||
@@ -846,6 +863,7 @@ def main(args: argparse.Namespace):
|
|||||||
if args.result_filename:
|
if args.result_filename:
|
||||||
file_name = args.result_filename
|
file_name = args.result_filename
|
||||||
if args.result_dir:
|
if args.result_dir:
|
||||||
|
os.makedirs(args.result_dir, exist_ok=True)
|
||||||
file_name = os.path.join(args.result_dir, file_name)
|
file_name = os.path.join(args.result_dir, file_name)
|
||||||
with open(
|
with open(
|
||||||
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
|
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
|
||||||
@@ -886,7 +904,7 @@ if __name__ == "__main__":
|
|||||||
"--dataset-name",
|
"--dataset-name",
|
||||||
type=str,
|
type=str,
|
||||||
default="sharegpt",
|
default="sharegpt",
|
||||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
|
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
|
||||||
help="Name of the dataset to benchmark on.",
|
help="Name of the dataset to benchmark on.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -1056,6 +1074,19 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
# group for dataset specific arguments
|
# group for dataset specific arguments
|
||||||
|
custom_group = parser.add_argument_group("custom dataset options")
|
||||||
|
custom_group.add_argument(
|
||||||
|
"--custom-output-len",
|
||||||
|
type=int,
|
||||||
|
default=256,
|
||||||
|
help="Number of output tokens per request, used only for custom dataset.",
|
||||||
|
)
|
||||||
|
custom_group.add_argument(
|
||||||
|
"--custom-skip-chat-template",
|
||||||
|
action="store_true",
|
||||||
|
help="Skip applying chat template to prompt, used only for custom dataset.",
|
||||||
|
)
|
||||||
|
|
||||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||||
sonnet_group.add_argument(
|
sonnet_group.add_argument(
|
||||||
"--sonnet-input-len",
|
"--sonnet-input-len",
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
r"""Benchmark online serving throughput with structured outputs.
|
r"""Benchmark online serving throughput with structured outputs.
|
||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
@@ -11,7 +12,6 @@ On the client side, run:
|
|||||||
--model <your_model> \
|
--model <your_model> \
|
||||||
--dataset json \
|
--dataset json \
|
||||||
--structured-output-ratio 1.0 \
|
--structured-output-ratio 1.0 \
|
||||||
--structured-output-backend auto \
|
|
||||||
--request-rate 10 \
|
--request-rate 10 \
|
||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""Benchmark offline inference throughput."""
|
"""Benchmark offline inference throughput."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
@@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
|
|||||||
|
|
||||||
def write_to_json(filename: str, records: list) -> None:
|
def write_to_json(filename: str, records: list) -> None:
|
||||||
with open(filename, "w") as f:
|
with open(filename, "w") as f:
|
||||||
json.dump(records, f, cls=InfEncoder)
|
json.dump(
|
||||||
|
records,
|
||||||
|
f,
|
||||||
|
cls=InfEncoder,
|
||||||
|
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
|
||||||
|
)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# Cutlass bench utils
|
# Cutlass bench utils
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# Weight Shapes are in the format
|
# Weight Shapes are in the format
|
||||||
# ([K, N], TP_SPLIT_DIM)
|
# ([K, N], TP_SPLIT_DIM)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import itertools
|
import itertools
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
|
|||||||
223
benchmarks/kernels/bench_fp8_gemm.py
Normal file
223
benchmarks/kernels/bench_fp8_gemm.py
Normal file
@@ -0,0 +1,223 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import argparse
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
|
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||||
|
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
|
||||||
|
@triton.testing.perf_report(
|
||||||
|
triton.testing.Benchmark(
|
||||||
|
x_names=["batch_size"],
|
||||||
|
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
|
||||||
|
x_log=False,
|
||||||
|
line_arg="provider",
|
||||||
|
line_vals=[
|
||||||
|
"torch-bf16",
|
||||||
|
# "fp8-tensor-w-token-a",
|
||||||
|
"fp8-tensor-w-tensor-a",
|
||||||
|
"fp8-channel-w-token-a",
|
||||||
|
# "fp8-channel-w-tensor-a",
|
||||||
|
# "fp8-tensor-w-token-a-noquant",
|
||||||
|
"fp8-tensor-w-tensor-a-noquant",
|
||||||
|
"fp8-channel-w-token-a-noquant",
|
||||||
|
# "fp8-channel-w-tensor-a-noquant",
|
||||||
|
],
|
||||||
|
line_names=[
|
||||||
|
"torch-bf16",
|
||||||
|
# "fp8-tensor-w-token-a",
|
||||||
|
"fp8-tensor-w-tensor-a",
|
||||||
|
"fp8-channel-w-token-a",
|
||||||
|
# "fp8-channel-w-tensor-a",
|
||||||
|
# "fp8-tensor-w-token-a-noquant",
|
||||||
|
"fp8-tensor-w-tensor-a-noquant",
|
||||||
|
"fp8-channel-w-token-a-noquant",
|
||||||
|
# "fp8-channel-w-tensor-a-noquant",
|
||||||
|
],
|
||||||
|
ylabel="TFLOP/s (larger is better)",
|
||||||
|
plot_name="BF16 vs FP8 GEMMs",
|
||||||
|
args={},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
def benchmark(batch_size, provider, N, K):
|
||||||
|
M = batch_size
|
||||||
|
device = "cuda"
|
||||||
|
dtype = torch.bfloat16
|
||||||
|
|
||||||
|
# Create input tensors
|
||||||
|
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||||
|
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||||
|
|
||||||
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
|
if "torch-bf16" in provider:
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
|
||||||
|
)
|
||||||
|
|
||||||
|
elif "fp8" in provider:
|
||||||
|
# Weights are always quantized ahead of time
|
||||||
|
if "noquant" in provider:
|
||||||
|
# For no quantization, we just measure the GEMM
|
||||||
|
if "tensor-w-token-a" in provider:
|
||||||
|
# Dynamic per-token quant for A, per-tensor quant for B
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
|
||||||
|
assert scale_b_fp8.numel() == 1
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||||
|
a, use_per_token_if_dynamic=True
|
||||||
|
)
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
elif "tensor-w-tensor-a" in provider:
|
||||||
|
# Static per-tensor quantization with fixed scales
|
||||||
|
# for both A and B
|
||||||
|
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||||
|
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||||
|
assert scale_b_fp8.numel() == 1
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
elif "channel-w-token-a" in provider:
|
||||||
|
# Static per-channel quantization for weights, per-token
|
||||||
|
# quant for A
|
||||||
|
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||||
|
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||||
|
assert scale_b_fp8.numel() == N
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||||
|
a, use_per_token_if_dynamic=True
|
||||||
|
)
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
elif "channel-w-tensor-a" in provider:
|
||||||
|
# Static per-channel quantization for weights, per-tensor
|
||||||
|
# quant for A
|
||||||
|
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||||
|
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||||
|
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||||
|
assert scale_b_fp8.numel() == N
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# In these cases, we quantize the activations during the GEMM call
|
||||||
|
if "tensor-w-token-a" in provider:
|
||||||
|
# Dynamic per-token quant for A, per-tensor quant for B
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
|
||||||
|
assert scale_b_fp8.numel() == 1
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||||
|
a, use_per_token_if_dynamic=True
|
||||||
|
)
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
elif "tensor-w-tensor-a" in provider:
|
||||||
|
# Static per-tensor quantization with fixed scales
|
||||||
|
# for both A and B
|
||||||
|
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||||
|
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||||
|
assert scale_b_fp8.numel() == 1
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
elif "channel-w-token-a" in provider:
|
||||||
|
# Static per-channel quantization for weights, per-token
|
||||||
|
# quant for A
|
||||||
|
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||||
|
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||||
|
assert scale_b_fp8.numel() == N
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||||
|
a, use_per_token_if_dynamic=True
|
||||||
|
)
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
elif "channel-w-tensor-a" in provider:
|
||||||
|
# Static per-channel quantization for weights, per-tensor
|
||||||
|
# quant for A
|
||||||
|
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||||
|
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||||
|
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||||
|
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||||
|
assert scale_b_fp8.numel() == N
|
||||||
|
|
||||||
|
def run_quant():
|
||||||
|
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||||
|
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||||
|
|
||||||
|
b_fp8 = b_fp8.t()
|
||||||
|
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: run_quant(), quantiles=quantiles
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate TFLOP/s, two flops per multiply-add
|
||||||
|
tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
|
||||||
|
return tflops(ms), tflops(max_ms), tflops(min_ms)
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_shapes(args):
|
||||||
|
KN_model_names = []
|
||||||
|
models_tps = list(itertools.product(args.models, args.tp_sizes))
|
||||||
|
for model, tp_size in models_tps:
|
||||||
|
assert model in WEIGHT_SHAPES
|
||||||
|
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||||
|
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
|
||||||
|
KN.append(model)
|
||||||
|
KN_model_names.append(KN)
|
||||||
|
return KN_model_names
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument(
|
||||||
|
"--models",
|
||||||
|
nargs="+",
|
||||||
|
type=str,
|
||||||
|
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||||
|
choices=[*WEIGHT_SHAPES.keys()],
|
||||||
|
help="List of models to benchmark",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tp-sizes",
|
||||||
|
nargs="+",
|
||||||
|
type=int,
|
||||||
|
default=[1],
|
||||||
|
help="List of tensor parallel sizes",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
KN_model_names = prepare_shapes(args)
|
||||||
|
for K, N, model_name in KN_model_names:
|
||||||
|
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
|
||||||
|
benchmark.run(
|
||||||
|
print_data=True,
|
||||||
|
show_plots=True,
|
||||||
|
save_path=f"bench_fp8_res_n{N}_k{K}",
|
||||||
|
N=N,
|
||||||
|
K=K,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Benchmark finished!")
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# Copyright (c) Microsoft Corporation.
|
# Copyright (c) Microsoft Corporation.
|
||||||
# Licensed under the MIT License.
|
# Licensed under the MIT License.
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
|
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
|
||||||
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
|
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
|
||||||
@@ -90,7 +91,7 @@ def bench_run(
|
|||||||
|
|
||||||
score = torch.randn((m, num_experts), device=device, dtype=dtype)
|
score = torch.randn((m, num_experts), device=device, dtype=dtype)
|
||||||
|
|
||||||
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
|
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
|
||||||
|
|
||||||
quant_blocksize = 16
|
quant_blocksize = 16
|
||||||
w1_blockscale = torch.empty(
|
w1_blockscale = torch.empty(
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as benchmark
|
import torch.utils.benchmark as benchmark
|
||||||
@@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
|
|||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||||
|
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||||
cutlass_moe_fp8,
|
|
||||||
fused_experts,
|
fused_experts,
|
||||||
fused_topk,
|
fused_topk,
|
||||||
)
|
)
|
||||||
@@ -69,18 +70,9 @@ def bench_run(
|
|||||||
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
|
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
|
||||||
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
|
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
|
||||||
|
|
||||||
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
|
||||||
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
|
|
||||||
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
|
|
||||||
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
|
||||||
|
|
||||||
for expert in range(num_experts):
|
for expert in range(num_experts):
|
||||||
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
|
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
|
||||||
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
|
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
|
||||||
w1_q_notransp = w1_q.clone()
|
|
||||||
w2_q_notransp = w2_q.clone()
|
|
||||||
w1_q = w1_q.transpose(1, 2)
|
|
||||||
w2_q = w2_q.transpose(1, 2)
|
|
||||||
|
|
||||||
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
|
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
|
||||||
|
|
||||||
@@ -121,10 +113,6 @@ def bench_run(
|
|||||||
w2_scale: torch.Tensor,
|
w2_scale: torch.Tensor,
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
num_repeats: int,
|
num_repeats: int,
|
||||||
):
|
):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
@@ -132,14 +120,10 @@ def bench_run(
|
|||||||
a,
|
a,
|
||||||
w1,
|
w1,
|
||||||
w2,
|
w2,
|
||||||
w1_scale,
|
|
||||||
w2_scale,
|
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
ab_strides1,
|
w1_scale,
|
||||||
c_strides1,
|
w2_scale,
|
||||||
ab_strides2,
|
|
||||||
c_strides2,
|
|
||||||
a1_scale=a_scale,
|
a1_scale=a_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -152,10 +136,6 @@ def bench_run(
|
|||||||
w2_scale: torch.Tensor,
|
w2_scale: torch.Tensor,
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
):
|
):
|
||||||
with set_current_vllm_config(
|
with set_current_vllm_config(
|
||||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||||
@@ -164,14 +144,10 @@ def bench_run(
|
|||||||
a,
|
a,
|
||||||
w1_q,
|
w1_q,
|
||||||
w2_q,
|
w2_q,
|
||||||
w1_scale,
|
|
||||||
w2_scale,
|
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
ab_strides1,
|
w1_scale,
|
||||||
c_strides1,
|
w2_scale,
|
||||||
ab_strides2,
|
|
||||||
c_strides2,
|
|
||||||
a1_scale=a_scale,
|
a1_scale=a_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -217,10 +193,6 @@ def bench_run(
|
|||||||
w2_scale,
|
w2_scale,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
ab_strides1,
|
|
||||||
c_strides1,
|
|
||||||
ab_strides2,
|
|
||||||
c_strides2,
|
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
@@ -229,8 +201,8 @@ def bench_run(
|
|||||||
with torch.cuda.graph(triton_graph, stream=triton_stream):
|
with torch.cuda.graph(triton_graph, stream=triton_stream):
|
||||||
run_triton_from_graph(
|
run_triton_from_graph(
|
||||||
a,
|
a,
|
||||||
w1_q_notransp,
|
w1_q,
|
||||||
w2_q_notransp,
|
w2_q,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
@@ -249,18 +221,12 @@ def bench_run(
|
|||||||
"w2": w2,
|
"w2": w2,
|
||||||
"score": score,
|
"score": score,
|
||||||
"topk": topk,
|
"topk": topk,
|
||||||
"w1_q_notransp": w1_q_notransp,
|
|
||||||
"w2_q_notransp": w2_q_notransp,
|
|
||||||
# Cutlass params
|
# Cutlass params
|
||||||
"a_scale": a_scale,
|
"a_scale": a_scale,
|
||||||
"w1_q": w1_q,
|
"w1_q": w1_q,
|
||||||
"w2_q": w2_q,
|
"w2_q": w2_q,
|
||||||
"w1_scale": w1_scale,
|
"w1_scale": w1_scale,
|
||||||
"w2_scale": w2_scale,
|
"w2_scale": w2_scale,
|
||||||
"ab_strides1": ab_strides1,
|
|
||||||
"c_strides1": c_strides1,
|
|
||||||
"ab_strides2": ab_strides2,
|
|
||||||
"c_strides2": c_strides2,
|
|
||||||
# cuda graph params
|
# cuda graph params
|
||||||
"cutlass_graph": cutlass_graph,
|
"cutlass_graph": cutlass_graph,
|
||||||
"triton_graph": triton_graph,
|
"triton_graph": triton_graph,
|
||||||
@@ -278,8 +244,8 @@ def bench_run(
|
|||||||
# Warmup
|
# Warmup
|
||||||
run_triton_moe(
|
run_triton_moe(
|
||||||
a,
|
a,
|
||||||
w1_q_notransp,
|
w1_q,
|
||||||
w2_q_notransp,
|
w2_q,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
@@ -290,7 +256,7 @@ def bench_run(
|
|||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
benchmark.Timer(
|
benchmark.Timer(
|
||||||
stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
|
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
@@ -321,16 +287,12 @@ def bench_run(
|
|||||||
w2_scale,
|
w2_scale,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
ab_strides1,
|
|
||||||
c_strides1,
|
|
||||||
ab_strides2,
|
|
||||||
c_strides2,
|
|
||||||
num_warmup,
|
num_warmup,
|
||||||
)
|
)
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
benchmark.Timer(
|
benchmark.Timer(
|
||||||
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
|
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as benchmark
|
import torch.utils.benchmark as benchmark
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
@@ -6,7 +7,6 @@ import time
|
|||||||
from contextlib import nullcontext
|
from contextlib import nullcontext
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from itertools import product
|
from itertools import product
|
||||||
from types import SimpleNamespace
|
|
||||||
from typing import Any, TypedDict
|
from typing import Any, TypedDict
|
||||||
|
|
||||||
import ray
|
import ray
|
||||||
@@ -42,7 +42,7 @@ def benchmark_config(
|
|||||||
use_fp8_w8a8: bool,
|
use_fp8_w8a8: bool,
|
||||||
use_int8_w8a16: bool,
|
use_int8_w8a16: bool,
|
||||||
num_iters: int = 100,
|
num_iters: int = 100,
|
||||||
block_quant_shape: List[int] = None,
|
block_quant_shape: list[int] = None,
|
||||||
use_deep_gemm: bool = False,
|
use_deep_gemm: bool = False,
|
||||||
) -> float:
|
) -> float:
|
||||||
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||||
@@ -399,7 +399,7 @@ class BenchmarkWorker:
|
|||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
use_fp8_w8a8: bool,
|
use_fp8_w8a8: bool,
|
||||||
use_int8_w8a16: bool,
|
use_int8_w8a16: bool,
|
||||||
block_quant_shape: List[int] = None,
|
block_quant_shape: list[int] = None,
|
||||||
use_deep_gemm: bool = False,
|
use_deep_gemm: bool = False,
|
||||||
) -> tuple[dict[str, int], float]:
|
) -> tuple[dict[str, int], float]:
|
||||||
current_platform.seed_everything(self.seed)
|
current_platform.seed_everything(self.seed)
|
||||||
@@ -531,7 +531,7 @@ def save_configs(
|
|||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
use_fp8_w8a8: bool,
|
use_fp8_w8a8: bool,
|
||||||
use_int8_w8a16: bool,
|
use_int8_w8a16: bool,
|
||||||
block_quant_shape: List[int],
|
block_quant_shape: list[int],
|
||||||
) -> None:
|
) -> None:
|
||||||
dtype_str = get_config_dtype_str(
|
dtype_str = get_config_dtype_str(
|
||||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||||
@@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
|
|||||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||||
if args.model_prefix:
|
if args.model_prefix:
|
||||||
config = getattr(config, args.model_prefix)
|
config = getattr(config, args.model_prefix)
|
||||||
config = SimpleNamespace(**config)
|
|
||||||
|
|
||||||
if config.architectures[0] == "DbrxForCausalLM":
|
if config.architectures[0] == "DbrxForCausalLM":
|
||||||
E = config.ffn_config.moe_num_experts
|
E = config.ffn_config.moe_num_experts
|
||||||
@@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
|
|||||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
|
|
||||||
hidden_size = config.hidden_size
|
hidden_size = config.hidden_size
|
||||||
dtype = (
|
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||||
torch.float16
|
|
||||||
if current_platform.is_rocm()
|
|
||||||
else getattr(torch, config.torch_dtype)
|
|
||||||
)
|
|
||||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||||
block_quant_shape = get_weight_block_size_safety(config)
|
block_quant_shape = get_weight_block_size_safety(config)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
from typing import Any, TypedDict
|
from typing import Any, TypedDict
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from itertools import accumulate
|
from itertools import accumulate
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
seed: int,
|
seed: int,
|
||||||
device: str,
|
device: str,
|
||||||
max_position: int = 8192,
|
max_position: int = 8192,
|
||||||
base: int = 10000,
|
base: float = 10000,
|
||||||
) -> None:
|
) -> None:
|
||||||
current_platform.seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
WEIGHT_SHAPES = {
|
WEIGHT_SHAPES = {
|
||||||
"ideal": [[4 * 256 * 32, 256 * 32]],
|
"ideal": [[4 * 256 * 32, 256 * 32]],
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# Adapted from sglang quantization/tuning_block_wise_kernel.py
|
# Adapted from sglang quantization/tuning_block_wise_kernel.py
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
# fmt: off
|
# fmt: off
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
import time
|
import time
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import math
|
import math
|
||||||
import pickle
|
import pickle
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
# Weight Shapes are in the format
|
# Weight Shapes are in the format
|
||||||
# ([K, N], TP_SPLIT_DIM)
|
# ([K, N], TP_SPLIT_DIM)
|
||||||
@@ -48,4 +49,50 @@ WEIGHT_SHAPES = {
|
|||||||
([16384, 106496], 1),
|
([16384, 106496], 1),
|
||||||
([53248, 16384], 0),
|
([53248, 16384], 0),
|
||||||
],
|
],
|
||||||
|
"meta-llama/Llama-3.1-8B-Instruct": [
|
||||||
|
([4096, 6144], 1),
|
||||||
|
([4096, 4096], 0),
|
||||||
|
([4096, 28672], 1),
|
||||||
|
([14336, 4096], 0),
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-3.3-70B-Instruct": [
|
||||||
|
([8192, 10240], 1),
|
||||||
|
([8192, 8192], 0),
|
||||||
|
([8192, 57344], 1),
|
||||||
|
([28672, 8192], 0),
|
||||||
|
],
|
||||||
|
"mistralai/Mistral-Large-Instruct-2407": [
|
||||||
|
([12288, 14336], 1),
|
||||||
|
([12288, 12288], 0),
|
||||||
|
([12288, 57344], 1),
|
||||||
|
([28672, 12288], 0),
|
||||||
|
],
|
||||||
|
"Qwen/Qwen2.5-7B-Instruct": [
|
||||||
|
([3584, 4608], 1),
|
||||||
|
([3584, 3584], 0),
|
||||||
|
([3584, 37888], 1),
|
||||||
|
([18944, 3584], 0),
|
||||||
|
],
|
||||||
|
"Qwen/Qwen2.5-32B-Instruct": [
|
||||||
|
([5120, 7168], 1),
|
||||||
|
([5120, 5120], 0),
|
||||||
|
([5120, 55296], 1),
|
||||||
|
([27648, 5120], 0),
|
||||||
|
],
|
||||||
|
"Qwen/Qwen2.5-72B-Instruct": [
|
||||||
|
([8192, 10240], 1),
|
||||||
|
([8192, 8192], 0),
|
||||||
|
([8192, 59136], 1),
|
||||||
|
([29568, 8192], 0),
|
||||||
|
],
|
||||||
|
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
|
||||||
|
([2048, 3072], 1),
|
||||||
|
([2048, 4096], 1),
|
||||||
|
([2048, 2048], 0),
|
||||||
|
([2048, 576], 0),
|
||||||
|
([2048, 21888], 1),
|
||||||
|
([10944, 2048], 0),
|
||||||
|
([2048, 2816], 1),
|
||||||
|
([1408, 2048], 0),
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
|||||||
else()
|
else()
|
||||||
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||||
|
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
|
||||||
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
||||||
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
||||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||||
@@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
|
|||||||
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
||||||
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
||||||
|
|
||||||
elseif (POWER9_FOUND OR POWER10_FOUND)
|
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||||
message(STATUS "PowerPC detected")
|
message(STATUS "PowerPC detected")
|
||||||
# Check for PowerPC VSX support
|
if (POWER9_FOUND)
|
||||||
list(APPEND CXX_COMPILE_FLAGS
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
"-mvsx"
|
"-mvsx"
|
||||||
"-mcpu=native"
|
"-mcpu=power9"
|
||||||
"-mtune=native")
|
"-mtune=power9")
|
||||||
|
elseif (POWER10_FOUND OR POWER11_FOUND)
|
||||||
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
|
"-mvsx"
|
||||||
|
"-mcpu=power10"
|
||||||
|
"-mtune=power10")
|
||||||
|
endif()
|
||||||
|
|
||||||
elseif (ASIMD_FOUND)
|
elseif (ASIMD_FOUND)
|
||||||
message(STATUS "ARMv8 or later architecture detected")
|
message(STATUS "ARMv8 or later architecture detected")
|
||||||
|
|||||||
@@ -46,22 +46,38 @@ else()
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||||
|
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
|
||||||
|
|
||||||
|
# Make sure vllm-flash-attn install rules are nested under vllm/
|
||||||
|
# This is here to support installing all components under the same prefix with cmake --install.
|
||||||
|
# setup.py installs every component separately but uses the same prefix for all.
|
||||||
|
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
|
||||||
|
# and these statements don't hurt when installing neither component.
|
||||||
|
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
|
||||||
|
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||||
|
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
|
||||||
|
|
||||||
# Fetch the vllm-flash-attn library
|
# Fetch the vllm-flash-attn library
|
||||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||||
|
|
||||||
|
# Restore the install prefix
|
||||||
|
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||||
|
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||||
|
|
||||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||||
# case only one is built, in the case both are built redundant work is done)
|
# case only one is built, in the case both are built redundant work is done)
|
||||||
install(
|
install(
|
||||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
DESTINATION vllm_flash_attn
|
DESTINATION vllm/vllm_flash_attn
|
||||||
COMPONENT _vllm_fa2_C
|
COMPONENT _vllm_fa2_C
|
||||||
FILES_MATCHING PATTERN "*.py"
|
FILES_MATCHING PATTERN "*.py"
|
||||||
)
|
)
|
||||||
|
|
||||||
install(
|
install(
|
||||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
DESTINATION vllm_flash_attn
|
DESTINATION vllm/vllm_flash_attn
|
||||||
COMPONENT _vllm_fa3_C
|
COMPONENT _vllm_fa3_C
|
||||||
FILES_MATCHING PATTERN "*.py"
|
FILES_MATCHING PATTERN "*.py"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
#
|
#
|
||||||
# A command line tool for running pytorch's hipify preprocessor on CUDA
|
# A command line tool for running pytorch's hipify preprocessor on CUDA
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
|
|||||||
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
|
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
|
||||||
add_custom_target(
|
add_custom_target(
|
||||||
hipify${NAME}
|
hipify${NAME}
|
||||||
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||||
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
|
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
|
||||||
BYPRODUCTS ${HIP_SRCS}
|
BYPRODUCTS ${HIP_SRCS}
|
||||||
COMMENT "Running hipify on ${NAME} extension source files.")
|
COMMENT "Running hipify on ${NAME} extension source files.")
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
|
|||||||
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
|
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
|
||||||
static_cast<ElementAcc*>(nullptr), stride_LSE},
|
static_cast<ElementAcc*>(nullptr), stride_LSE},
|
||||||
hw_info,
|
hw_info,
|
||||||
-1, // split_kv
|
1, // split_kv
|
||||||
nullptr, // is_var_split_kv
|
nullptr, // is_var_split_kv
|
||||||
};
|
};
|
||||||
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
|
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import enum
|
import enum
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import glob
|
import glob
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -30,4 +30,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
|
|||||||
int64_t BLOCK_SIZE_K, int64_t bit);
|
int64_t BLOCK_SIZE_K, int64_t bit);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool moe_permute_unpermute_supported();
|
bool moe_permute_unpermute_supported();
|
||||||
|
|
||||||
|
void shuffle_rows(const torch::Tensor& input_tensor,
|
||||||
|
const torch::Tensor& dst2src_map,
|
||||||
|
torch::Tensor& output_tensor);
|
||||||
@@ -130,6 +130,62 @@ void moe_unpermute(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void shuffleInputRowsKernel(const T* input,
|
||||||
|
const int32_t* dst2src_map, T* output,
|
||||||
|
int64_t num_src_rows,
|
||||||
|
int64_t num_dst_rows, int64_t num_cols) {
|
||||||
|
int64_t dest_row_idx = blockIdx.x;
|
||||||
|
int64_t const source_row_idx = dst2src_map[dest_row_idx];
|
||||||
|
|
||||||
|
if (blockIdx.x < num_dst_rows) {
|
||||||
|
// Load 128-bits per thread
|
||||||
|
constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
|
||||||
|
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
|
||||||
|
|
||||||
|
// Duplicate and permute rows
|
||||||
|
auto const* source_row_ptr =
|
||||||
|
reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
|
||||||
|
auto* dest_row_ptr =
|
||||||
|
reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
|
||||||
|
|
||||||
|
int64_t const start_offset = threadIdx.x;
|
||||||
|
int64_t const stride = blockDim.x;
|
||||||
|
int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
|
||||||
|
|
||||||
|
for (int elem_index = start_offset; elem_index < num_elems_in_col;
|
||||||
|
elem_index += stride) {
|
||||||
|
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void shuffle_rows(const torch::Tensor& input_tensor,
|
||||||
|
const torch::Tensor& dst2src_map,
|
||||||
|
torch::Tensor& output_tensor) {
|
||||||
|
TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
|
||||||
|
"Input and output tensors must have the same data type");
|
||||||
|
|
||||||
|
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||||
|
int64_t const blocks = output_tensor.size(0);
|
||||||
|
int64_t const threads = 256;
|
||||||
|
int64_t const num_dest_rows = output_tensor.size(0);
|
||||||
|
int64_t const num_src_rows = input_tensor.size(0);
|
||||||
|
int64_t const num_cols = input_tensor.size(1);
|
||||||
|
|
||||||
|
TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
|
||||||
|
"num_cols must be divisible by 128 / "
|
||||||
|
"sizeof(input_tensor.scalar_type()) / 8");
|
||||||
|
|
||||||
|
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
|
||||||
|
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
|
||||||
|
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
|
||||||
|
dst2src_map.data_ptr<int32_t>(),
|
||||||
|
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
|
||||||
|
num_dest_rows, num_cols);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
|
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
|
||||||
|
|||||||
@@ -14,12 +14,13 @@
|
|||||||
__VA_ARGS__(); \
|
__VA_ARGS__(); \
|
||||||
break; \
|
break; \
|
||||||
}
|
}
|
||||||
#define MOE_DISPATCH_FLOAT_CASE(...) \
|
#define MOE_DISPATCH_FLOAT_CASE(...) \
|
||||||
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
||||||
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||||
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
|
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
|
||||||
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
|
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
|
||||||
|
MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
|
||||||
|
|
||||||
#define MOE_DISPATCH(TYPE, ...) \
|
#define MOE_DISPATCH(TYPE, ...) \
|
||||||
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
|
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
|
||||||
@@ -39,6 +40,11 @@ template <>
|
|||||||
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
|
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
|
||||||
using type = __nv_bfloat16;
|
using type = __nv_bfloat16;
|
||||||
};
|
};
|
||||||
|
// uint8 for packed fp4
|
||||||
|
template <>
|
||||||
|
struct ScalarType2CudaType<at::ScalarType::Byte> {
|
||||||
|
using type = uint8_t;
|
||||||
|
};
|
||||||
|
|
||||||
// #if __CUDA_ARCH__ >= 890
|
// #if __CUDA_ARCH__ >= 890
|
||||||
// fp8
|
// fp8
|
||||||
|
|||||||
@@ -516,9 +516,8 @@ void topk_softmax(
|
|||||||
topk,
|
topk,
|
||||||
stream);
|
stream);
|
||||||
}
|
}
|
||||||
else
|
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
|
||||||
{
|
{
|
||||||
assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
|
|
||||||
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
||||||
gating_output.data_ptr<float>(),
|
gating_output.data_ptr<float>(),
|
||||||
topk_weights.data_ptr<float>(),
|
topk_weights.data_ptr<float>(),
|
||||||
@@ -530,4 +529,17 @@ void topk_softmax(
|
|||||||
topk,
|
topk,
|
||||||
stream);
|
stream);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
assert(topk_indices.scalar_type() == at::ScalarType::Int64);
|
||||||
|
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
||||||
|
gating_output.data_ptr<float>(),
|
||||||
|
topk_weights.data_ptr<float>(),
|
||||||
|
topk_indices.data_ptr<int64_t>(),
|
||||||
|
token_expert_indices.data_ptr<int>(),
|
||||||
|
softmax_workspace.data_ptr<float>(),
|
||||||
|
num_tokens,
|
||||||
|
num_experts,
|
||||||
|
topk,
|
||||||
|
stream);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,6 +81,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
m.def("moe_permute_unpermute_supported() -> bool");
|
m.def("moe_permute_unpermute_supported() -> bool");
|
||||||
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
|
||||||
|
|
||||||
|
// Row shuffle for MoE
|
||||||
|
m.def(
|
||||||
|
"shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
|
||||||
|
"output_tensor) -> ()");
|
||||||
|
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
19
csrc/ops.h
19
csrc/ops.h
@@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
|||||||
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
||||||
torch::Tensor& weight, double epsilon);
|
torch::Tensor& weight, double epsilon);
|
||||||
|
|
||||||
|
void apply_repetition_penalties_(torch::Tensor& logits,
|
||||||
|
const torch::Tensor& prompt_mask,
|
||||||
|
const torch::Tensor& output_mask,
|
||||||
|
const torch::Tensor& repetition_penalties);
|
||||||
|
|
||||||
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
||||||
torch::Tensor& weight, torch::Tensor& scale,
|
torch::Tensor& weight, torch::Tensor& scale,
|
||||||
double epsilon);
|
double epsilon);
|
||||||
@@ -231,7 +236,8 @@ void cutlass_moe_mm(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides);
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch);
|
||||||
|
|
||||||
void cutlass_fp4_group_mm(
|
void cutlass_fp4_group_mm(
|
||||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||||
@@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data(
|
|||||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||||
const int64_t num_experts, const int64_t n, const int64_t k);
|
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||||
|
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||||
|
|
||||||
|
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||||
|
torch::Tensor& problem_sizes1,
|
||||||
|
torch::Tensor& problem_sizes2,
|
||||||
|
const torch::Tensor& expert_num_tokens,
|
||||||
|
const int64_t num_local_experts,
|
||||||
|
const int64_t padded_m, const int64_t n,
|
||||||
|
const int64_t k);
|
||||||
|
|
||||||
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
|
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
|
|||||||
@@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
|
|||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales) {
|
torch::Tensor const& b_scales) {
|
||||||
TORCH_CHECK(
|
|
||||||
a.size(0) % 4 == 0,
|
|
||||||
"Input tensor must have a number of rows that is a multiple of 4. ",
|
|
||||||
"but got: ", a.size(0), " rows.");
|
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
|
cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, a_scales, b_scales);
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "cuda_utils.h"
|
||||||
#include "cutlass/cutlass.h"
|
#include "cutlass/cutlass.h"
|
||||||
#include "cutlass/numeric_types.h"
|
#include "cutlass/numeric_types.h"
|
||||||
|
|
||||||
@@ -22,49 +23,49 @@ namespace vllm {
|
|||||||
|
|
||||||
using namespace cute;
|
using namespace cute;
|
||||||
|
|
||||||
template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
|
// clang-format off
|
||||||
class ClusterShape, typename EpilogueScheduler,
|
template <class OutType, int ScaleGranularityM,
|
||||||
typename MainloopScheduler>
|
int ScaleGranularityN, int ScaleGranularityK,
|
||||||
|
class MmaTileShape, class ClusterShape,
|
||||||
|
class EpilogueScheduler, class MainloopScheduler,
|
||||||
|
bool swap_ab_ = false>
|
||||||
struct cutlass_3x_gemm_fp8_blockwise {
|
struct cutlass_3x_gemm_fp8_blockwise {
|
||||||
|
static constexpr bool swap_ab = swap_ab_;
|
||||||
using ElementAB = cutlass::float_e4m3_t;
|
using ElementAB = cutlass::float_e4m3_t;
|
||||||
|
|
||||||
using ElementA = ElementAB;
|
using ElementA = ElementAB;
|
||||||
using LayoutA = cutlass::layout::RowMajor;
|
using LayoutA = cutlass::layout::RowMajor;
|
||||||
|
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
|
||||||
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
|
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
|
||||||
|
|
||||||
using ElementB = ElementAB;
|
using ElementB = ElementAB;
|
||||||
using LayoutB = cutlass::layout::ColumnMajor;
|
using LayoutB = cutlass::layout::ColumnMajor;
|
||||||
|
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
|
||||||
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
|
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
|
||||||
|
|
||||||
using ElementC = void;
|
|
||||||
using ElementD = OutType;
|
using ElementD = OutType;
|
||||||
using LayoutD = cutlass::layout::RowMajor;
|
using LayoutD = cutlass::layout::RowMajor;
|
||||||
|
using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
|
||||||
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||||
|
|
||||||
|
using ElementC = void; // TODO: support bias
|
||||||
using LayoutC = LayoutD;
|
using LayoutC = LayoutD;
|
||||||
|
using LayoutC_Transpose = LayoutD_Transpose;
|
||||||
static constexpr int AlignmentC = AlignmentD;
|
static constexpr int AlignmentC = AlignmentD;
|
||||||
|
|
||||||
using ElementAccumulator = float;
|
using ElementAccumulator = float;
|
||||||
using ElementCompute = float;
|
using ElementCompute = float;
|
||||||
using ElementBlockScale = float;
|
using ElementBlockScale = float;
|
||||||
|
|
||||||
// MMA and Cluster Tile Shapes
|
using ScaleConfig = conditional_t<swap_ab,
|
||||||
// Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
|
cutlass::detail::Sm100BlockwiseScaleConfig<
|
||||||
// Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
|
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
|
||||||
static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
|
cute::UMMA::Major::K, cute::UMMA::Major::MN>,
|
||||||
static constexpr int ScaleGranularityM =
|
cutlass::detail::Sm100BlockwiseScaleConfig<
|
||||||
size<0>(MmaTileShape{}) / ScaleMsPerTile;
|
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
|
||||||
static constexpr int ScaleGranularityN =
|
cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
|
||||||
size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
|
|
||||||
static constexpr int ScaleGranularityK =
|
|
||||||
size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
|
|
||||||
|
|
||||||
// Shape of the threadblocks in a cluster
|
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
|
||||||
using ClusterShape_MNK = ClusterShape;
|
|
||||||
|
|
||||||
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
|
|
||||||
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
|
|
||||||
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
|
|
||||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
|
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
|
||||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
|
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
|
||||||
|
|
||||||
@@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {
|
|||||||
|
|
||||||
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
|
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
|
||||||
using ElementScalar = float;
|
using ElementScalar = float;
|
||||||
// clang-format off
|
|
||||||
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
|
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
|
||||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||||
ArchTag,
|
ArchTag,
|
||||||
@@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
|
|||||||
ElementAccumulator,
|
ElementAccumulator,
|
||||||
ElementCompute,
|
ElementCompute,
|
||||||
ElementC,
|
ElementC,
|
||||||
LayoutC,
|
conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
|
||||||
AlignmentC,
|
AlignmentC,
|
||||||
ElementD,
|
ElementD,
|
||||||
LayoutD,
|
conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
|
||||||
AlignmentD,
|
AlignmentD,
|
||||||
EpilogueScheduler,
|
EpilogueScheduler,
|
||||||
DefaultOperation
|
DefaultOperation
|
||||||
>::CollectiveOp;
|
>::CollectiveOp;
|
||||||
|
|
||||||
using StageCountType = cutlass::gemm::collective::StageCountAuto;
|
using StageCountType = cutlass::gemm::collective::StageCountAuto;
|
||||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
using CollectiveMainloop = conditional_t<swap_ab,
|
||||||
ArchTag,
|
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||||
OperatorClass,
|
ArchTag,
|
||||||
ElementA,
|
OperatorClass,
|
||||||
cute::tuple<LayoutA, LayoutSFA>,
|
ElementB,
|
||||||
AlignmentA,
|
cute::tuple<LayoutB_Transpose, LayoutSFA>,
|
||||||
ElementB,
|
AlignmentB,
|
||||||
cute::tuple<LayoutB, LayoutSFB>,
|
ElementA,
|
||||||
AlignmentB,
|
cute::tuple<LayoutA_Transpose, LayoutSFB>,
|
||||||
ElementAccumulator,
|
AlignmentA,
|
||||||
MmaTileShape,
|
ElementAccumulator,
|
||||||
ClusterShape,
|
MmaTileShape,
|
||||||
|
ClusterShape,
|
||||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||||
MainloopScheduler
|
MainloopScheduler
|
||||||
>::CollectiveOp;
|
>::CollectiveOp,
|
||||||
// clang-format on
|
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||||
|
ArchTag,
|
||||||
|
OperatorClass,
|
||||||
|
ElementA,
|
||||||
|
cute::tuple<LayoutA, LayoutSFA>,
|
||||||
|
AlignmentA,
|
||||||
|
ElementB,
|
||||||
|
cute::tuple<LayoutB, LayoutSFB>,
|
||||||
|
AlignmentB,
|
||||||
|
ElementAccumulator,
|
||||||
|
MmaTileShape,
|
||||||
|
ClusterShape,
|
||||||
|
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||||
|
MainloopScheduler
|
||||||
|
>::CollectiveOp>;
|
||||||
|
|
||||||
using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
|
using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
|
||||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
|
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
|
||||||
@@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales) {
|
torch::Tensor const& b_scales) {
|
||||||
|
static constexpr bool swap_ab = Gemm::swap_ab;
|
||||||
using GemmKernel = typename Gemm::GemmKernel;
|
using GemmKernel = typename Gemm::GemmKernel;
|
||||||
using StrideA = typename Gemm::GemmKernel::StrideA;
|
using StrideA = typename Gemm::GemmKernel::StrideA;
|
||||||
using StrideB = typename Gemm::GemmKernel::StrideB;
|
using StrideB = typename Gemm::GemmKernel::StrideB;
|
||||||
@@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
using ElementD = typename Gemm::ElementD;
|
using ElementD = typename Gemm::ElementD;
|
||||||
|
|
||||||
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
|
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
|
||||||
auto prob_shape = cute::make_shape(m, n, k, 1);
|
|
||||||
|
|
||||||
StrideA a_stride;
|
StrideA a_stride;
|
||||||
StrideB b_stride;
|
StrideB b_stride;
|
||||||
@@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
b_stride =
|
b_stride =
|
||||||
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
|
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
|
||||||
c_stride =
|
c_stride =
|
||||||
cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
|
cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
|
||||||
|
|
||||||
LayoutSFA layout_SFA =
|
LayoutSFA layout_SFA = swap_ab ?
|
||||||
|
ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
|
||||||
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
|
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
|
||||||
LayoutSFB layout_SFB =
|
LayoutSFB layout_SFB = swap_ab ?
|
||||||
|
ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
|
||||||
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
|
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
|
||||||
|
|
||||||
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
|
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
|
||||||
@@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
|
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
|
||||||
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
|
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
|
||||||
|
|
||||||
typename GemmKernel::MainloopArguments mainloop_args{
|
auto mainloop_args = [&](){
|
||||||
a_ptr, a_stride, b_ptr, b_stride,
|
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
|
||||||
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
|
if (swap_ab) {
|
||||||
|
return typename GemmKernel::MainloopArguments{
|
||||||
|
b_ptr, b_stride, a_ptr, a_stride,
|
||||||
|
b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
|
||||||
|
};
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return typename GemmKernel::MainloopArguments{
|
||||||
|
a_ptr, a_stride, b_ptr, b_stride,
|
||||||
|
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}();
|
||||||
|
auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
|
||||||
|
|
||||||
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
||||||
typename GemmKernel::EpilogueArguments epilogue_args{
|
typename GemmKernel::EpilogueArguments epilogue_args{
|
||||||
@@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales) {
|
torch::Tensor const& b_scales) {
|
||||||
auto m = a.size(0);
|
int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
|
||||||
auto k = a.size(1);
|
|
||||||
auto n = b.size(1);
|
|
||||||
int sms;
|
|
||||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
|
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
|
||||||
|
|
||||||
auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
|
constexpr int TILE_K = 128;
|
||||||
return std::ceil(static_cast<float>(m) / tile1SM) *
|
// TODO: better heuristics
|
||||||
std::ceil(static_cast<float>(n) / tile1SM) >=
|
bool swap_ab = (m < 16) || (m % 4 != 0);
|
||||||
sms;
|
bool use_tma_epilogue = (m * n) % 4 == 0;
|
||||||
};
|
if (!swap_ab) {
|
||||||
bool use_2sm = should_use_2sm(m, n);
|
constexpr int TILE_N = 128;
|
||||||
if (use_2sm) {
|
int tile_m = 256;
|
||||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
|
||||||
OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
|
tile_m = 64;
|
||||||
Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
|
}
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
|
else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
|
||||||
out, a, b, a_scales, b_scales);
|
tile_m = 128;
|
||||||
|
}
|
||||||
|
if (tile_m == 64) {
|
||||||
|
if (use_tma_epilogue) {
|
||||||
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
|
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
|
||||||
|
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
|
||||||
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||||
|
out, a, b, a_scales, b_scales);
|
||||||
|
} else {
|
||||||
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
|
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
|
||||||
|
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||||
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||||
|
out, a, b, a_scales, b_scales);
|
||||||
|
}
|
||||||
|
} else if (tile_m == 128) {
|
||||||
|
if (use_tma_epilogue) {
|
||||||
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
|
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
|
||||||
|
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
|
||||||
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||||
|
out, a, b, a_scales, b_scales);
|
||||||
|
} else {
|
||||||
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
|
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
|
||||||
|
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||||
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||||
|
out, a, b, a_scales, b_scales);
|
||||||
|
}
|
||||||
|
} else { // tile_m == 256
|
||||||
|
if (use_tma_epilogue) {
|
||||||
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
|
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
|
||||||
|
Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
|
||||||
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
|
||||||
|
out, a, b, a_scales, b_scales);
|
||||||
|
} else {
|
||||||
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
|
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
|
||||||
|
Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
|
||||||
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
|
||||||
|
out, a, b, a_scales, b_scales);
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// TODO: Test more tile N configs
|
||||||
|
constexpr int TILE_M = 128;
|
||||||
|
constexpr int TILE_N = 16;
|
||||||
|
// TMA epilogue isn't compatible with Swap A/B
|
||||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||||
OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
|
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
|
||||||
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
|
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, a_scales, b_scales);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
|
|||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue>
|
template <typename, typename, typename> typename Epilogue>
|
||||||
struct sm100_fp8_config_default {
|
struct sm100_fp8_config_default {
|
||||||
|
// M in (128, inf)
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
||||||
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
||||||
@@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
|
|||||||
KernelSchedule, EpilogueSchedule>;
|
KernelSchedule, EpilogueSchedule>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_M128 {
|
||||||
|
// M in (64, 128]
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
||||||
|
using TileShape = Shape<_128, _128, _64>;
|
||||||
|
using ClusterShape = Shape<_2, _2, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_M64 {
|
||||||
|
// M in [1, 64]
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
||||||
|
using TileShape = Shape<_64, _64, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _8, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue,
|
template <typename, typename, typename> typename Epilogue,
|
||||||
typename... EpilogueArgs>
|
typename... EpilogueArgs>
|
||||||
@@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
using Cutlass3xGemmDefault =
|
using Cutlass3xGemmDefault =
|
||||||
typename sm100_fp8_config_default<InType, OutType,
|
typename sm100_fp8_config_default<InType, OutType,
|
||||||
Epilogue>::Cutlass3xGemm;
|
Epilogue>::Cutlass3xGemm;
|
||||||
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
|
using Cutlass3xGemmM64 =
|
||||||
out, a, b, std::forward<EpilogueArgs>(args)...);
|
typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM128 =
|
||||||
|
typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
|
||||||
|
uint32_t const m = a.size(0);
|
||||||
|
uint32_t const mp2 =
|
||||||
|
std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2
|
||||||
|
|
||||||
|
if (mp2 <= 64) {
|
||||||
|
// m in [1, 64]
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM64>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 128) {
|
||||||
|
// m in (64, 128]
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM128>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
// m in (128, inf)
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <template <typename, typename, typename> typename Epilogue,
|
template <template <typename, typename, typename> typename Epilogue,
|
||||||
|
|||||||
@@ -84,7 +84,8 @@ void run_cutlass_moe_mm_sm90(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch) {
|
||||||
TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
|
TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
|
||||||
TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
|
TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
|
||||||
TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
|
TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
|
||||||
@@ -113,19 +114,23 @@ void run_cutlass_moe_mm_sm90(
|
|||||||
if (n >= 8192) {
|
if (n >= 8192) {
|
||||||
cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
|
cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
|
||||||
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
problem_sizes, a_strides, b_strides, c_strides);
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
} else if (k >= 8192) {
|
} else if (k >= 8192) {
|
||||||
cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
|
cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
|
||||||
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
problem_sizes, a_strides, b_strides, c_strides);
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
} else if (m <= 16) {
|
} else if (m <= 16) {
|
||||||
cutlass_group_gemm_caller<Cutlass3xGemmM16>(
|
cutlass_group_gemm_caller<Cutlass3xGemmM16>(
|
||||||
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
problem_sizes, a_strides, b_strides, c_strides);
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
} else {
|
} else {
|
||||||
cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
|
cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
|
||||||
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
problem_sizes, a_strides, b_strides, c_strides);
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,15 +139,18 @@ void dispatch_moe_mm_sm90(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch) {
|
||||||
if (out_tensors.dtype() == torch::kBFloat16) {
|
if (out_tensors.dtype() == torch::kBFloat16) {
|
||||||
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
|
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
|
||||||
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
problem_sizes, a_strides, b_strides, c_strides);
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
} else {
|
} else {
|
||||||
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
|
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
|
||||||
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
|
||||||
problem_sizes, a_strides, b_strides, c_strides);
|
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
|
||||||
|
per_out_ch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,8 +161,9 @@ void cutlass_moe_mm_sm90(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch) {
|
||||||
dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides);
|
c_strides, per_act_token, per_out_ch);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -76,7 +76,8 @@ void cutlass_group_gemm_caller(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch) {
|
||||||
using ElementAB = typename Gemm::ElementAB;
|
using ElementAB = typename Gemm::ElementAB;
|
||||||
using ElementD = typename Gemm::ElementD;
|
using ElementD = typename Gemm::ElementD;
|
||||||
|
|
||||||
@@ -84,9 +85,6 @@ void cutlass_group_gemm_caller(
|
|||||||
int k_size = a_tensors.size(1);
|
int k_size = a_tensors.size(1);
|
||||||
int n_size = out_tensors.size(1);
|
int n_size = out_tensors.size(1);
|
||||||
|
|
||||||
bool per_act_token = a_scales.numel() != 1;
|
|
||||||
bool per_out_ch = b_scales.numel() != num_experts;
|
|
||||||
|
|
||||||
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
|
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
|
||||||
|
|
||||||
auto options_int =
|
auto options_int =
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
constexpr uint64_t THREADS_PER_EXPERT = 512;
|
constexpr uint64_t THREADS_PER_EXPERT = 512;
|
||||||
|
|
||||||
__global__ void compute_problem_sizes(const int* __restrict__ topk_ids,
|
__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
|
||||||
int32_t* problem_sizes1,
|
int32_t* problem_sizes1,
|
||||||
int32_t* problem_sizes2,
|
int32_t* problem_sizes2,
|
||||||
int32_t* atomic_buffer,
|
int32_t* atomic_buffer,
|
||||||
@@ -45,7 +45,24 @@ __global__ void compute_expert_offsets(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
|
__global__ void compute_expert_blockscale_offsets(
|
||||||
|
const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
|
||||||
|
int32_t* blockscale_offsets, int32_t* atomic_buffer,
|
||||||
|
const int num_experts) {
|
||||||
|
int32_t tot_offset = 0;
|
||||||
|
int32_t tot_offset_round = 0;
|
||||||
|
expert_offsets[0] = 0;
|
||||||
|
blockscale_offsets[0] = 0;
|
||||||
|
for (int i = 0; i < num_experts; ++i) {
|
||||||
|
atomic_buffer[i] = tot_offset;
|
||||||
|
tot_offset += problem_sizes1[i * 3];
|
||||||
|
expert_offsets[i + 1] = tot_offset;
|
||||||
|
tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
|
||||||
|
blockscale_offsets[i + 1] = tot_offset_round;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
|
||||||
const int32_t* __restrict__ expert_offsets,
|
const int32_t* __restrict__ expert_offsets,
|
||||||
int32_t* input_permutation,
|
int32_t* input_permutation,
|
||||||
int32_t* output_permutation,
|
int32_t* output_permutation,
|
||||||
@@ -77,7 +94,8 @@ void get_cutlass_moe_mm_data_caller(
|
|||||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||||
const int64_t num_experts, const int64_t n, const int64_t k) {
|
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||||
|
const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||||
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
||||||
auto options_int32 =
|
auto options_int32 =
|
||||||
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
||||||
@@ -85,19 +103,61 @@ void get_cutlass_moe_mm_data_caller(
|
|||||||
|
|
||||||
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
|
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
|
||||||
compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
|
compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
static_cast<const uint32_t*>(topk_ids.data_ptr()),
|
||||||
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
||||||
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||||
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
|
||||||
compute_expert_offsets<<<1, 1, 0, stream>>>(
|
if (blockscale_offsets.has_value()) {
|
||||||
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
|
||||||
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
||||||
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
|
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||||
|
static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
|
||||||
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
|
||||||
|
} else {
|
||||||
|
compute_expert_offsets<<<1, 1, 0, stream>>>(
|
||||||
|
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
|
||||||
|
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||||
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
|
||||||
|
}
|
||||||
compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
|
compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
|
||||||
static_cast<const int32_t*>(topk_ids.data_ptr()),
|
static_cast<const uint32_t*>(topk_ids.data_ptr()),
|
||||||
static_cast<const int32_t*>(expert_offsets.data_ptr()),
|
static_cast<const int32_t*>(expert_offsets.data_ptr()),
|
||||||
static_cast<int32_t*>(input_permutation.data_ptr()),
|
static_cast<int32_t*>(input_permutation.data_ptr()),
|
||||||
static_cast<int32_t*>(output_permutation.data_ptr()),
|
static_cast<int32_t*>(output_permutation.data_ptr()),
|
||||||
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
|
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
|
||||||
topk_ids.size(1));
|
topk_ids.size(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void compute_pplx_data(int32_t* expert_offsets,
|
||||||
|
int32_t* problem_sizes1,
|
||||||
|
int32_t* problem_sizes2,
|
||||||
|
const int32_t* __restrict__ expert_num_tokens,
|
||||||
|
const int padded_m, const int n,
|
||||||
|
const int k) {
|
||||||
|
int expert_idx = threadIdx.x;
|
||||||
|
|
||||||
|
expert_offsets[expert_idx] = expert_idx * padded_m;
|
||||||
|
problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
|
||||||
|
problem_sizes1[expert_idx * 3 + 1] = 2 * n;
|
||||||
|
problem_sizes1[expert_idx * 3 + 2] = k;
|
||||||
|
problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
|
||||||
|
problem_sizes2[expert_idx * 3 + 1] = k;
|
||||||
|
problem_sizes2[expert_idx * 3 + 2] = n;
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
|
||||||
|
torch::Tensor& problem_sizes1,
|
||||||
|
torch::Tensor& problem_sizes2,
|
||||||
|
const torch::Tensor& expert_num_tokens,
|
||||||
|
const int64_t num_local_experts,
|
||||||
|
const int64_t padded_m,
|
||||||
|
const int64_t n, const int64_t k) {
|
||||||
|
auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
|
||||||
|
|
||||||
|
compute_pplx_data<<<1, num_local_experts, 0, stream>>>(
|
||||||
|
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||||
|
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
||||||
|
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||||
|
static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
|
||||||
|
k);
|
||||||
|
}
|
||||||
|
|||||||
@@ -36,7 +36,8 @@ void cutlass_moe_mm_sm90(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides);
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -54,7 +55,16 @@ void get_cutlass_moe_mm_data_caller(
|
|||||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||||
const int64_t num_experts, const int64_t n, const int64_t k);
|
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||||
|
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||||
|
|
||||||
|
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
|
||||||
|
torch::Tensor& problem_sizes1,
|
||||||
|
torch::Tensor& problem_sizes2,
|
||||||
|
const torch::Tensor& expert_num_tokens,
|
||||||
|
const int64_t num_local_experts,
|
||||||
|
const int64_t padded_m,
|
||||||
|
const int64_t n, const int64_t k);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||||
@@ -206,12 +216,13 @@ void cutlass_moe_mm(
|
|||||||
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
|
||||||
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
|
||||||
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
|
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
|
||||||
|
bool per_act_token, bool per_out_ch) {
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
||||||
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides);
|
c_strides, per_act_token, per_out_ch);
|
||||||
return;
|
return;
|
||||||
#endif
|
#endif
|
||||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||||
@@ -224,7 +235,8 @@ void get_cutlass_moe_mm_data(
|
|||||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||||
const int64_t num_experts, const int64_t n, const int64_t k) {
|
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||||
|
const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||||
// This function currently gets compiled only if we have a valid cutlass moe
|
// This function currently gets compiled only if we have a valid cutlass moe
|
||||||
// mm to run it for.
|
// mm to run it for.
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
@@ -232,7 +244,8 @@ void get_cutlass_moe_mm_data(
|
|||||||
(defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
|
(defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
|
||||||
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
|
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
|
||||||
problem_sizes2, input_permutation,
|
problem_sizes2, input_permutation,
|
||||||
output_permutation, num_experts, n, k);
|
output_permutation, num_experts, n, k,
|
||||||
|
blockscale_offsets);
|
||||||
return;
|
return;
|
||||||
#endif
|
#endif
|
||||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||||
@@ -242,6 +255,29 @@ void get_cutlass_moe_mm_data(
|
|||||||
version_num, ". Required capability: 90");
|
version_num, ". Required capability: 90");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||||
|
torch::Tensor& problem_sizes1,
|
||||||
|
torch::Tensor& problem_sizes2,
|
||||||
|
const torch::Tensor& expert_num_tokens,
|
||||||
|
const int64_t num_local_experts,
|
||||||
|
const int64_t padded_m, const int64_t n,
|
||||||
|
const int64_t k) {
|
||||||
|
// This function currently gets compiled only if we have a valid cutlass moe
|
||||||
|
// mm to run it for.
|
||||||
|
int32_t version_num = get_sm_version_num();
|
||||||
|
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
||||||
|
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
|
||||||
|
problem_sizes2, expert_num_tokens,
|
||||||
|
num_local_experts, padded_m, n, k);
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||||
|
false,
|
||||||
|
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
|
||||||
|
"for CUDA device capability: ",
|
||||||
|
version_num, ". Required capability: 90");
|
||||||
|
}
|
||||||
|
|
||||||
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
|
|||||||
@@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
|||||||
fp8_type* __restrict__ token_output = &out[offset];
|
fp8_type* __restrict__ token_output = &out[offset];
|
||||||
|
|
||||||
// For vectorization, token_input and token_output pointers need to be
|
// For vectorization, token_input and token_output pointers need to be
|
||||||
// aligned at 8-byte and 4-byte addresses respectively.
|
// aligned at 32-byte and 16-byte addresses respectively.
|
||||||
bool const can_vectorize = hidden_size % 4 == 0;
|
bool const can_vectorize = hidden_size % 16 == 0;
|
||||||
|
|
||||||
float absmax_val = 0.0f;
|
float absmax_val = 0.0f;
|
||||||
if (can_vectorize) {
|
if (can_vectorize) {
|
||||||
@@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
|||||||
} else {
|
} else {
|
||||||
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
||||||
float const x = static_cast<float>(token_input[i]);
|
float const x = static_cast<float>(token_input[i]);
|
||||||
absmax_val = max(absmax_val, fabs(x));
|
absmax_val = fmaxf(absmax_val, fabsf(x));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 256>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStorage;
|
__shared__ typename BlockReduce::TempStorage reduceStorage;
|
||||||
float const block_absmax_val_maybe =
|
float const block_absmax_val_maybe =
|
||||||
BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
|
BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
|
||||||
__shared__ float token_scale;
|
__shared__ float token_scale;
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
if (scale_ub) {
|
if (scale_ub) {
|
||||||
token_scale = min(block_absmax_val_maybe, *scale_ub);
|
token_scale = fminf(block_absmax_val_maybe, *scale_ub);
|
||||||
} else {
|
} else {
|
||||||
token_scale = block_absmax_val_maybe;
|
token_scale = block_absmax_val_maybe;
|
||||||
}
|
}
|
||||||
// token scale computation
|
// token scale computation
|
||||||
token_scale = max(token_scale / quant_type_max_v<fp8_type>,
|
token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
|
||||||
min_scaling_factor<fp8_type>::val());
|
min_scaling_factor<fp8_type>::val());
|
||||||
scale[token_idx] = token_scale;
|
scale[token_idx] = token_scale;
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
|||||||
torch::Tensor const& input, // [..., d]
|
torch::Tensor const& input, // [..., d]
|
||||||
torch::Tensor const& scale) // [1]
|
torch::Tensor const& scale) // [1]
|
||||||
{
|
{
|
||||||
int64_t num_tokens = input.numel() / input.size(-1);
|
int const block_size = 256;
|
||||||
int64_t num_elems = input.numel();
|
int const num_tokens = input.numel() / input.size(-1);
|
||||||
dim3 grid(num_tokens);
|
int const num_elems = input.numel();
|
||||||
dim3 block(1024);
|
dim3 const grid(num_tokens);
|
||||||
|
dim3 const block(block_size);
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
@@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
|||||||
torch::Tensor const& input, // [..., d]
|
torch::Tensor const& input, // [..., d]
|
||||||
torch::Tensor& scale) // [1]
|
torch::Tensor& scale) // [1]
|
||||||
{
|
{
|
||||||
int64_t num_tokens = input.numel() / input.size(-1);
|
int const block_size = 256;
|
||||||
int64_t num_elems = input.numel();
|
int const num_tokens = input.numel() / input.size(-1);
|
||||||
dim3 grid(num_tokens);
|
int const num_elems = input.numel();
|
||||||
dim3 block(1024);
|
dim3 const grid(num_tokens);
|
||||||
|
dim3 const block(block_size);
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
@@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant(
|
|||||||
|
|
||||||
int const hidden_size = input.size(-1);
|
int const hidden_size = input.size(-1);
|
||||||
int const num_tokens = input.numel() / hidden_size;
|
int const num_tokens = input.numel() / hidden_size;
|
||||||
|
int const block_size = 256;
|
||||||
dim3 const grid(num_tokens);
|
dim3 const grid(num_tokens);
|
||||||
dim3 const block(std::min(hidden_size, 1024));
|
dim3 const block(std::min(hidden_size, block_size));
|
||||||
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
|
|||||||
}
|
}
|
||||||
|
|
||||||
float r =
|
float r =
|
||||||
fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
|
fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
return static_cast<fp8_type>(r);
|
return static_cast<fp8_type>(r);
|
||||||
#else
|
#else
|
||||||
@@ -65,7 +65,7 @@ template <typename scalar_t, typename fp8_type>
|
|||||||
__global__ void segmented_max_reduction(float* __restrict__ scale,
|
__global__ void segmented_max_reduction(float* __restrict__ scale,
|
||||||
const scalar_t* __restrict__ input,
|
const scalar_t* __restrict__ input,
|
||||||
int64_t num_elems) {
|
int64_t num_elems) {
|
||||||
__shared__ float cache[1024];
|
__shared__ float cache[256];
|
||||||
int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
// First store maximum for all values processes by
|
// First store maximum for all values processes by
|
||||||
@@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
|
|||||||
scalar_t tmp = 0.0;
|
scalar_t tmp = 0.0;
|
||||||
while (i < num_elems) {
|
while (i < num_elems) {
|
||||||
float x = static_cast<float>(input[i]);
|
float x = static_cast<float>(input[i]);
|
||||||
tmp = max(tmp, fabs(x));
|
tmp = fmaxf(tmp, fabsf(x));
|
||||||
i += blockDim.x * gridDim.x;
|
i += blockDim.x * gridDim.x;
|
||||||
}
|
}
|
||||||
cache[threadIdx.x] = tmp;
|
cache[threadIdx.x] = tmp;
|
||||||
@@ -100,25 +100,27 @@ template <typename scalar_t>
|
|||||||
__device__ float thread_max_vec(scalar_t const* __restrict__ input,
|
__device__ float thread_max_vec(scalar_t const* __restrict__ input,
|
||||||
int64_t const num_elems, int const tid,
|
int64_t const num_elems, int const tid,
|
||||||
int const step) {
|
int const step) {
|
||||||
|
constexpr size_t VEC_SIZE = 16;
|
||||||
|
using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
|
||||||
// Vectorized input/output to better utilize memory bandwidth.
|
// Vectorized input/output to better utilize memory bandwidth.
|
||||||
vec4_t<scalar_t> const* vectorized_in =
|
auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
|
||||||
reinterpret_cast<vec4_t<scalar_t> const*>(input);
|
|
||||||
|
|
||||||
int64_t const num_vec_elems = num_elems >> 2;
|
// num_elems / VEC_SIZE (which is 16)
|
||||||
|
int64_t const num_vec_elems = num_elems >> 4;
|
||||||
float absmax_val = 0.0f;
|
float absmax_val = 0.0f;
|
||||||
|
|
||||||
#pragma unroll 4
|
#pragma unroll
|
||||||
for (int64_t i = tid; i < num_vec_elems; i += step) {
|
for (int64_t i = tid; i < num_vec_elems; i += step) {
|
||||||
vec4_t<scalar_t> in_vec = vectorized_in[i];
|
scalarxN_t in_vec = vectorized_in[i];
|
||||||
absmax_val = max(absmax_val, fabs(in_vec.x));
|
#pragma unroll
|
||||||
absmax_val = max(absmax_val, fabs(in_vec.y));
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
absmax_val = max(absmax_val, fabs(in_vec.z));
|
absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
|
||||||
absmax_val = max(absmax_val, fabs(in_vec.w));
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle the remaining elements if num_elems is not divisible by 4
|
// Handle the remaining elements if num_elems is not divisible by VEC_SIZE
|
||||||
for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
|
for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
|
||||||
absmax_val = max(absmax_val, fabs(input[i]));
|
absmax_val = fmaxf(absmax_val, fabsf(input[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
return absmax_val;
|
return absmax_val;
|
||||||
@@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
|
|||||||
float const scale,
|
float const scale,
|
||||||
int64_t const num_elems,
|
int64_t const num_elems,
|
||||||
int const tid, int const step) {
|
int const tid, int const step) {
|
||||||
using float8x4_t = q8x4_t<fp8_type>;
|
constexpr size_t VEC_SIZE = 16;
|
||||||
|
using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
|
||||||
|
using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
|
||||||
// Vectorized input/output to better utilize memory bandwidth.
|
// Vectorized input/output to better utilize memory bandwidth.
|
||||||
auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
|
auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
|
||||||
auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);
|
auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
|
||||||
|
|
||||||
int64_t const num_vec_elems = num_elems >> 2;
|
// num_elems / VEC_SIZE (which is 16)
|
||||||
|
int64_t const num_vec_elems = num_elems >> 4;
|
||||||
|
|
||||||
#pragma unroll 4
|
#pragma unroll
|
||||||
for (int64_t i = tid; i < num_vec_elems; i += step) {
|
for (int64_t i = tid; i < num_vec_elems; i += step) {
|
||||||
vec4_t<scalar_t> in_vec = vectorized_in[i];
|
scalarxN_t in_vec = vectorized_in[i];
|
||||||
float8x4_t out_vec;
|
float8xN_t out_vec;
|
||||||
|
|
||||||
out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
#pragma unroll
|
||||||
static_cast<float>(in_vec.x), scale);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
||||||
static_cast<float>(in_vec.y), scale);
|
static_cast<float>(in_vec.val[j]), scale);
|
||||||
out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
}
|
||||||
static_cast<float>(in_vec.z), scale);
|
|
||||||
out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
|
||||||
static_cast<float>(in_vec.w), scale);
|
|
||||||
vectorized_out[i] = out_vec;
|
vectorized_out[i] = out_vec;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle the remaining elements if num_elems is not divisible by 4
|
// Handle the remaining elements if num_elems is not divisible by VEC_SIZE
|
||||||
for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
|
for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
|
||||||
out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
|
||||||
static_cast<float>(input[i]), scale);
|
static_cast<float>(input[i]), scale);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -140,6 +140,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
|||||||
// sum of squares
|
// sum of squares
|
||||||
float ss = 0.0f;
|
float ss = 0.0f;
|
||||||
|
|
||||||
|
const int VEC_SIZE = 4;
|
||||||
int32_t const num_vec_elems = hidden_size >> 2;
|
int32_t const num_vec_elems = hidden_size >> 2;
|
||||||
|
|
||||||
#pragma unroll 4
|
#pragma unroll 4
|
||||||
@@ -147,22 +148,23 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
|||||||
vec4_t<scalar_t> in = vec_input[i];
|
vec4_t<scalar_t> in = vec_input[i];
|
||||||
|
|
||||||
vec4_t<float> x;
|
vec4_t<float> x;
|
||||||
x.x = static_cast<float>(in.x);
|
#pragma unroll
|
||||||
x.y = static_cast<float>(in.y);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
x.z = static_cast<float>(in.z);
|
x.val[j] = static_cast<float>(in.val[j]);
|
||||||
x.w = static_cast<float>(in.w);
|
|
||||||
if constexpr (has_residual) {
|
|
||||||
vec4_t<scalar_t> r = vec_residual[i];
|
|
||||||
x.x += static_cast<float>(r.x);
|
|
||||||
x.y += static_cast<float>(r.y);
|
|
||||||
x.z += static_cast<float>(r.z);
|
|
||||||
x.w += static_cast<float>(r.w);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ss += x.x * x.x;
|
if constexpr (has_residual) {
|
||||||
ss += x.y * x.y;
|
vec4_t<scalar_t> r = vec_residual[i];
|
||||||
ss += x.z * x.z;
|
#pragma unroll
|
||||||
ss += x.w * x.w;
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
|
x.val[j] += static_cast<float>(r.val[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
|
ss += x.val[j] * x.val[j];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
@@ -203,6 +205,7 @@ __device__ void compute_dynamic_per_token_scales(
|
|||||||
|
|
||||||
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
||||||
|
|
||||||
|
const int VEC_SIZE = 4;
|
||||||
int32_t const num_vec_elems = hidden_size >> 2;
|
int32_t const num_vec_elems = hidden_size >> 2;
|
||||||
float block_absmax_val_maybe = 0.0f;
|
float block_absmax_val_maybe = 0.0f;
|
||||||
|
|
||||||
@@ -212,26 +215,25 @@ __device__ void compute_dynamic_per_token_scales(
|
|||||||
vec4_t<scalar_t> const w = vec_weight[i];
|
vec4_t<scalar_t> const w = vec_weight[i];
|
||||||
|
|
||||||
vec4_t<float> x;
|
vec4_t<float> x;
|
||||||
x.x = static_cast<float>(in.x);
|
#pragma unroll
|
||||||
x.y = static_cast<float>(in.y);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
x.z = static_cast<float>(in.z);
|
x.val[j] = static_cast<float>(in.val[j]);
|
||||||
x.w = static_cast<float>(in.w);
|
|
||||||
if constexpr (has_residual) {
|
|
||||||
vec4_t<scalar_t> r = vec_residual[i];
|
|
||||||
x.x += static_cast<float>(r.x);
|
|
||||||
x.y += static_cast<float>(r.y);
|
|
||||||
x.z += static_cast<float>(r.z);
|
|
||||||
x.w += static_cast<float>(r.w);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
block_absmax_val_maybe = fmaxf(
|
if constexpr (has_residual) {
|
||||||
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.x * rms) * w.x));
|
vec4_t<scalar_t> r = vec_residual[i];
|
||||||
block_absmax_val_maybe = fmaxf(
|
#pragma unroll
|
||||||
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.y * rms) * w.y));
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
block_absmax_val_maybe = fmaxf(
|
x.val[j] += static_cast<float>(r.val[j]);
|
||||||
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.z * rms) * w.z));
|
}
|
||||||
block_absmax_val_maybe = fmaxf(
|
}
|
||||||
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.w * rms) * w.w));
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
|
block_absmax_val_maybe =
|
||||||
|
fmaxf(block_absmax_val_maybe,
|
||||||
|
fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
@@ -282,6 +284,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
|||||||
vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
|
vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const int VEC_SIZE = 4;
|
||||||
int32_t const num_vec_elems = hidden_size >> 2;
|
int32_t const num_vec_elems = hidden_size >> 2;
|
||||||
|
|
||||||
// TODO(luka/varun) extract into type-agnostic vectorized quant function to
|
// TODO(luka/varun) extract into type-agnostic vectorized quant function to
|
||||||
@@ -292,33 +295,31 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
|||||||
vec4_t<scalar_t> const w = vec_weight[i];
|
vec4_t<scalar_t> const w = vec_weight[i];
|
||||||
|
|
||||||
vec4_t<float> x;
|
vec4_t<float> x;
|
||||||
x.x = static_cast<float>(in.x);
|
#pragma unroll
|
||||||
x.y = static_cast<float>(in.y);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
x.z = static_cast<float>(in.z);
|
x.val[j] = static_cast<float>(in.val[j]);
|
||||||
x.w = static_cast<float>(in.w);
|
}
|
||||||
|
|
||||||
if constexpr (has_residual) {
|
if constexpr (has_residual) {
|
||||||
vec4_t<scalar_t> r = vec_residual[i];
|
vec4_t<scalar_t> r = vec_residual[i];
|
||||||
x.x += static_cast<float>(r.x);
|
#pragma unroll
|
||||||
x.y += static_cast<float>(r.y);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
x.z += static_cast<float>(r.z);
|
x.val[j] += static_cast<float>(r.val[j]);
|
||||||
x.w += static_cast<float>(r.w);
|
}
|
||||||
// Update residual
|
// Update residual
|
||||||
r.x = static_cast<scalar_t>(x.x);
|
#pragma unroll
|
||||||
r.y = static_cast<scalar_t>(x.y);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
r.z = static_cast<scalar_t>(x.z);
|
r.val[j] = static_cast<scalar_t>(x.val[j]);
|
||||||
r.w = static_cast<scalar_t>(x.w);
|
}
|
||||||
vec_residual[i] = r;
|
vec_residual[i] = r;
|
||||||
}
|
}
|
||||||
|
|
||||||
q8x4_t<scalar_out_t> out;
|
q8x4_t<scalar_out_t> out;
|
||||||
out.x = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
|
#pragma unroll
|
||||||
static_cast<scalar_t>(x.x * rms) * w.x, scale);
|
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||||
out.y = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
|
out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
|
||||||
static_cast<scalar_t>(x.y * rms) * w.y, scale);
|
static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
|
||||||
out.z = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
|
}
|
||||||
static_cast<scalar_t>(x.z * rms) * w.z, scale);
|
|
||||||
out.w = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
|
|
||||||
static_cast<scalar_t>(x.w * rms) * w.w, scale);
|
|
||||||
vec_output[i] = out;
|
vec_output[i] = out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import glob
|
import glob
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
import os
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
import math
|
import math
|
||||||
|
|||||||
@@ -10,23 +10,22 @@
|
|||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Vectorization containers
|
// Vectorization containers
|
||||||
template <typename scalar_t>
|
template <typename scalar_t, size_t vec_size>
|
||||||
struct __align__(8) vec4_t {
|
struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
|
||||||
scalar_t x;
|
scalar_t val[vec_size];
|
||||||
scalar_t y;
|
|
||||||
scalar_t z;
|
|
||||||
scalar_t w;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename quant_type_t>
|
template <typename quant_type_t, size_t vec_size>
|
||||||
struct __align__(4) q8x4_t {
|
struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t {
|
||||||
static_assert(std::is_same_v<quant_type_t, int8_t> ||
|
static_assert(std::is_same_v<quant_type_t, int8_t> ||
|
||||||
std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
|
std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
|
||||||
std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
|
std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
|
||||||
quant_type_t x;
|
quant_type_t val[vec_size];
|
||||||
quant_type_t y;
|
|
||||||
quant_type_t z;
|
|
||||||
quant_type_t w;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
using vec4_t = vec_n_t<scalar_t, 4>;
|
||||||
|
template <typename quant_type_t>
|
||||||
|
using q8x4_t = q8_n_t<quant_type_t, 4>;
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|||||||
@@ -13,14 +13,34 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/fp8/common.cuh"
|
#include "quantization/fp8/common.cuh"
|
||||||
|
|
||||||
#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
|
#if defined(__HIPCC__) && \
|
||||||
#define __HIP__MI300_MI250__
|
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
|
||||||
|
#define __HIP__GFX9__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__HIPCC__) && defined(__gfx942__)
|
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
|
||||||
#define __HIP__MI300__
|
#define __HIP__MI3XX__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__gfx950__)
|
||||||
|
#define LDS_SIZE 160 * 1024
|
||||||
|
#else
|
||||||
|
#define LDS_SIZE 64 * 1024
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int get_lds_size() {
|
||||||
|
static bool is_cached = false;
|
||||||
|
static int result;
|
||||||
|
if (is_cached == false) {
|
||||||
|
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||||
|
std::string device_arch = dprops->gcnArchName;
|
||||||
|
size_t substring = device_arch.find("gfx95");
|
||||||
|
result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
|
||||||
|
is_cached = true;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(NDEBUG)
|
#if defined(NDEBUG)
|
||||||
#undef NDEBUG
|
#undef NDEBUG
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
@@ -267,7 +287,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
|||||||
V0 += (s.x + s.y); \
|
V0 += (s.x + s.y); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||||
// This version targets cases where A[] fits LDS capacity
|
// This version targets cases where A[] fits LDS capacity
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N>
|
int UNRL, int N>
|
||||||
@@ -275,7 +295,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||||
const scalar_t* __restrict__ A, scalar_t* C,
|
const scalar_t* __restrict__ A, scalar_t* C,
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
#if defined(__HIP__MI300__)
|
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||||
|
#if defined(__HIP__MI3XX__)
|
||||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||||
#else
|
#else
|
||||||
constexpr bool use_mfma = false;
|
constexpr bool use_mfma = false;
|
||||||
@@ -295,13 +316,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
};
|
};
|
||||||
|
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
// Reserving 64 KB of LDS to have 1 WG / CU
|
// Reserving 64/160 KB of LDS to have 1 WG / CU
|
||||||
// Goal is to bring the activation matrix A to the LDS
|
// Goal is to bring the activation matrix A to the LDS
|
||||||
// and use it across the lifetime of the work group
|
// and use it across the lifetime of the work group
|
||||||
// TODO: When activation matrix is larger than 64 KB
|
// TODO: When activation matrix is larger than 64 KB
|
||||||
// then this is not goint to work!
|
// then this is not goint to work!
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
__shared__ scalar_t s[1024 * 32];
|
__shared__ scalar_t s[max_lds_len];
|
||||||
|
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
// Fetch the activation matrix to LDS
|
// Fetch the activation matrix to LDS
|
||||||
@@ -312,11 +333,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
// - Then the WG will move to another 8 K elements
|
// - Then the WG will move to another 8 K elements
|
||||||
// TODO: Logic below will only work when K is multiple of 8
|
// TODO: Logic below will only work when K is multiple of 8
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||||
|
|
||||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
if (k_in >= min(K * N, max_lds_len)) break;
|
||||||
|
|
||||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||||
}
|
}
|
||||||
@@ -517,7 +538,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
m += CuCount * _WvPrGrp * YTILE;
|
m += CuCount * _WvPrGrp * YTILE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N>
|
int UNRL, int N>
|
||||||
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
||||||
@@ -525,9 +546,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
|
|||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
|
|
||||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||||
// This version targets cases where A[] marginally exceeds LDS capacity
|
// This version targets cases where A[] marginally exceeds LDS capacity
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N>
|
int UNRL, int N>
|
||||||
@@ -535,7 +556,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||||
const scalar_t* __restrict__ A, scalar_t* C,
|
const scalar_t* __restrict__ A, scalar_t* C,
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
#if defined(__HIP__MI300__)
|
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||||
|
#if defined(__HIP__MI3XX__)
|
||||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||||
#else
|
#else
|
||||||
constexpr bool use_mfma = false;
|
constexpr bool use_mfma = false;
|
||||||
@@ -561,7 +583,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
// TODO: When activation matrix is larger than 64 KB
|
// TODO: When activation matrix is larger than 64 KB
|
||||||
// then this is not goint to work!
|
// then this is not goint to work!
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
__shared__ scalar_t s[1024 * 32];
|
__shared__ scalar_t s[max_lds_len];
|
||||||
|
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
// Computation of columns that need to be committed to memory!
|
// Computation of columns that need to be committed to memory!
|
||||||
@@ -598,11 +620,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
// - Then the WG will move to another 8 K elements
|
// - Then the WG will move to another 8 K elements
|
||||||
// TODO: Logic below will only work when K is multiple of 8
|
// TODO: Logic below will only work when K is multiple of 8
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||||
|
|
||||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
if (k_in >= min(K * N, max_lds_len)) break;
|
||||||
|
|
||||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||||
}
|
}
|
||||||
@@ -686,7 +708,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
// Fetch A activation matrix in interleaved fashion from LDS or memory
|
// Fetch A activation matrix in interleaved fashion from LDS or memory
|
||||||
|
|
||||||
for (int n = 0; n < N; n++) {
|
for (int n = 0; n < N; n++) {
|
||||||
if (k_ + K * n < 32 * 1024)
|
if (k_ + K * n < max_lds_len)
|
||||||
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
||||||
else
|
else
|
||||||
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
||||||
@@ -817,7 +839,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N>
|
int UNRL, int N>
|
||||||
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
||||||
@@ -825,9 +847,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
|
|||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
|
|
||||||
#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
|
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||||
// This version targets big A[] cases, where it is much larger than LDS capacity
|
// This version targets big A[] cases, where it is much larger than LDS capacity
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N>
|
int UNRL, int N>
|
||||||
@@ -835,7 +857,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||||
const scalar_t* __restrict__ A, scalar_t* C,
|
const scalar_t* __restrict__ A, scalar_t* C,
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
#if defined(__HIP__MI300__)
|
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||||
|
#if defined(__HIP__MI3XX__)
|
||||||
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
|
||||||
#else
|
#else
|
||||||
constexpr bool use_mfma = false;
|
constexpr bool use_mfma = false;
|
||||||
@@ -855,13 +878,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
};
|
};
|
||||||
|
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
// Reserving 64 KB of LDS to have 1 WG / CU
|
// Reserving 64/160 KB of LDS to have 1 WG / CU
|
||||||
// Goal is to bring the activation matrix A to the LDS
|
// Goal is to bring the activation matrix A to the LDS
|
||||||
// and use it across the lifetime of the work group
|
// and use it across the lifetime of the work group
|
||||||
// TODO: When activation matrix is larger than 64 KB
|
// TODO: When activation matrix is larger than 64 KB
|
||||||
// then this is not goint to work!
|
// then this is not goint to work!
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
__shared__ scalar_t s[1024 * 32];
|
__shared__ scalar_t s[max_lds_len];
|
||||||
|
|
||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
// Computation of columns that need to be committed to memory!
|
// Computation of columns that need to be committed to memory!
|
||||||
@@ -902,11 +925,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
//----------------------------------------------------
|
//----------------------------------------------------
|
||||||
#define PCML
|
#define PCML
|
||||||
#ifndef PCML
|
#ifndef PCML
|
||||||
for (uint32_t k = 0; k < min(K * N, 32 * 1024);
|
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||||
|
|
||||||
if (k_in >= min(K * N, 32 * 1024)) break;
|
if (k_in >= min(K * N, max_lds_len)) break;
|
||||||
|
|
||||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||||
}
|
}
|
||||||
@@ -916,7 +939,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
#define TUC (THRDS * UNRL * A_CHUNK)
|
#define TUC (THRDS * UNRL * A_CHUNK)
|
||||||
uint32_t kBase = 0;
|
uint32_t kBase = 0;
|
||||||
// find biggest k size that fits in LDS
|
// find biggest k size that fits in LDS
|
||||||
uint32_t kFit = (32 * 1024) / N;
|
uint32_t kFit = (max_lds_len) / N;
|
||||||
// kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
|
// kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
|
||||||
// of TUC
|
// of TUC
|
||||||
kFit = (kFit % TUC == 0)
|
kFit = (kFit % TUC == 0)
|
||||||
@@ -1164,7 +1187,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N>
|
int UNRL, int N>
|
||||||
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
||||||
@@ -1172,7 +1195,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
|
|||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
|
|
||||||
int mindiv(int N, int div1, int div2) {
|
int mindiv(int N, int div1, int div2) {
|
||||||
int nPrRnd = div1 * div2;
|
int nPrRnd = div1 * div2;
|
||||||
@@ -1222,17 +1245,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
|||||||
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
const int max_lds_len = get_lds_size() / 2;
|
||||||
|
|
||||||
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||||
_N) \
|
_N) \
|
||||||
{ \
|
{ \
|
||||||
dim3 block(64, _WvPrGrp); \
|
dim3 block(64, _WvPrGrp); \
|
||||||
if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \
|
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||||
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||||
CuCount); \
|
CuCount); \
|
||||||
} else if (K_in * N_in <= 32 * 1024 * 1.2) { \
|
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||||
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
|
||||||
@@ -1272,7 +1296,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
|
|||||||
return out_c;
|
return out_c;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__HIP__MI300__) // TODO: Add NAVI support
|
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||||
@@ -1281,6 +1305,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
const float* __restrict__ s_A,
|
const float* __restrict__ s_A,
|
||||||
const float* __restrict__ s_B, const int _WvPrGrp,
|
const float* __restrict__ s_B, const int _WvPrGrp,
|
||||||
const int CuCount) {
|
const int CuCount) {
|
||||||
|
constexpr int max_lds_len = LDS_SIZE;
|
||||||
using scalar8 =
|
using scalar8 =
|
||||||
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
||||||
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||||
@@ -1296,10 +1321,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
scalar8 h8;
|
scalar8 h8;
|
||||||
};
|
};
|
||||||
|
|
||||||
__shared__ fp8_t s[1024 * 64];
|
__shared__ fp8_t s[max_lds_len];
|
||||||
|
|
||||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||||
k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
|
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@@ -1436,7 +1461,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
m += CuCount * _WvPrGrp * YTILE;
|
m += CuCount * _WvPrGrp * YTILE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else // !defined(__HIP__MI300__) TODO: Add NAVI support
|
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
||||||
@@ -1446,9 +1471,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
|
|||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
#endif // defined(__HIP__MI300__) TODO: Add NAVI support
|
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||||
|
|
||||||
#if defined(__HIP__MI300__) // TODO: Add NAVI support
|
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||||
@@ -1456,6 +1481,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
const fp8_t* __restrict__ A, scalar_t* C,
|
const fp8_t* __restrict__ A, scalar_t* C,
|
||||||
const float* __restrict__ s_A, const float* __restrict__ s_B,
|
const float* __restrict__ s_A, const float* __restrict__ s_B,
|
||||||
const int _WvPrGrp, const int CuCount) {
|
const int _WvPrGrp, const int CuCount) {
|
||||||
|
constexpr int max_lds_len = LDS_SIZE;
|
||||||
using scalar8 =
|
using scalar8 =
|
||||||
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
|
||||||
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||||
@@ -1471,10 +1497,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
scalar8 h8;
|
scalar8 h8;
|
||||||
};
|
};
|
||||||
|
|
||||||
__shared__ fp8_t s[1024 * 64];
|
__shared__ fp8_t s[max_lds_len];
|
||||||
|
|
||||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||||
k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
|
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@@ -1517,7 +1543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||||
if (k_ >= K) break;
|
if (k_ >= K) break;
|
||||||
for (int n = 0; n < N; n++) {
|
for (int n = 0; n < N; n++) {
|
||||||
if (k_ + K * n < 64 * 1024)
|
if (k_ + K * n < max_lds_len)
|
||||||
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
|
||||||
else
|
else
|
||||||
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
|
||||||
@@ -1608,7 +1634,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
m += CuCount * _WvPrGrp * YTILE;
|
m += CuCount * _WvPrGrp * YTILE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else // !defined(__HIP__MI300__) TODO: Add NAVI support
|
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||||
int A_CHUNK, int UNRL, int N>
|
int A_CHUNK, int UNRL, int N>
|
||||||
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||||
@@ -1618,7 +1644,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
|||||||
const int CuCount) {
|
const int CuCount) {
|
||||||
UNREACHABLE_CODE
|
UNREACHABLE_CODE
|
||||||
}
|
}
|
||||||
#endif // defined(__HIP__MI300__) TODO: Add NAVI support
|
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||||
|
|
||||||
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
||||||
at::Tensor& scale_a, at::Tensor& scale_b,
|
at::Tensor& scale_a, at::Tensor& scale_b,
|
||||||
@@ -1638,12 +1664,13 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
|
|||||||
dim3 grid(CuCount);
|
dim3 grid(CuCount);
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
const int max_lds_len = get_lds_size();
|
||||||
|
|
||||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||||
_N) \
|
_N) \
|
||||||
{ \
|
{ \
|
||||||
dim3 block(64, _WvPrGrp); \
|
dim3 block(64, _WvPrGrp); \
|
||||||
if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \
|
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||||
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
||||||
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
|
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user