Compare commits
193 Commits
| SHA1 |
|---|
| fd47e57f4b |
| 203ab8f80f |
| 4141608c6a |
| dfe43a2071 |
| 16b24e7dcd |
| f519902c52 |
| 250e26a63e |
| 2b184ddd4f |
| 00298e092c |
| 89feb4c84d |
| ec10cb8511 |
| d11b46f3a5 |
| c6cf9295e1 |
| de9fb4bef8 |
| 8baf85e4e9 |
| 1a1823871d |
| 6cf1167c1a |
| f710090d8e |
| 7342a7d7f8 |
| df3dcdf49d |
| 36ea79079b |
| e808156f30 |
| cbc2ef5529 |
| 94bf9ae4e9 |
| f990bab2a4 |
| e00c094f15 |
| a78c6ba7c8 |
| fb870fd491 |
| 270953bafb |
| 9cc811c4ff |
| e4d652ea3e |
| 78c0b4166c |
| 21efb603f5 |
| 055f3270d4 |
| 18511aeda6 |
| 83ea5c72b9 |
| 04de9057ab |
| 07c11cf4d4 |
| f3a507f1d3 |
| a64e7b9407 |
| ce00231a8b |
| de895f1697 |
| cf25b93bdd |
| d5fbb8706d |
| cdca8994bd |
| ca77dd7a44 |
| 7dea289066 |
| cfaa6008e6 |
| 21906a6f50 |
| dc4aea677a |
| c8627cd41b |
| 8bfaa4e31e |
| 0b5b5d767e |
| cdc72e3c80 |
| 7627172bf4 |
| 480b7f40cf |
| acce7630c1 |
| ffc4b27ea8 |
| 2f4117c38e |
| 9ba0bd6aa6 |
| 2a131965a8 |
| bd37b9fbe2 |
| de24046fcd |
| 1874c6a1b0 |
| 9a94ca4a5d |
| cfba685bd4 |
| 069d3bd8d0 |
| a3691b6b5e |
| 8c746226c9 |
| e1faa2a598 |
| 80b57f00d5 |
| 04c12f8157 |
| 8eeb857084 |
| fa45513a51 |
| c0d9a98d0c |
| e0dbdb013d |
| 93cf74a8a7 |
| 151ef4efd2 |
| f19da64871 |
| 4f95ffee6f |
| 8c6de96ea1 |
| 18b296fdb2 |
| c8f26bb636 |
| 487678d046 |
| cb3b2b9ba4 |
| fdf59d30ea |
| b22b798471 |
| f22619fe96 |
| 168cab6bbf |
| 23fea8714a |
| f4dd830e09 |
| 5df1834895 |
| cfadb9c687 |
| 15986f598c |
| 53b3a33027 |
| dac914b0d6 |
| a95354a36e |
| 663874e048 |
| cc90419e89 |
| 27302dd584 |
| 0cc566ca8f |
| 05c531be47 |
| fbb74420e7 |
| 05d686432f |
| 0dcc8cbe5a |
| 26aa325f4f |
| e5dc713c23 |
| 36eecfbddb |
| 9ade8bbc8d |
| 22482e495e |
| 3d826d2c52 |
| 0e36fd4909 |
| 0f6d7a9a34 |
| 303d44790a |
| aeb37c2a72 |
| 3dbb215b38 |
| 2838d6b38e |
| 91add85ec4 |
| 9aaf14c62e |
| 63e39937f9 |
| f5d72b2fc6 |
| 83caf35e08 |
| 01843c89b8 |
| 19a4dd0990 |
| 18c2e30c57 |
| 19f0d25796 |
| f58d4fccc9 |
| afb050b29d |
| 7f60520deb |
| 563649aafe |
| 1570203864 |
| 22f5851b80 |
| 4f341bd4bf |
| 35bd215168 |
| 1fe0a4264a |
| bc4eb65b54 |
| 82f3937e59 |
| 7da2487591 |
| aaccca2b4d |
| 062c89e7c9 |
| bce324487a |
| 1425a1bcf9 |
| 1cabfcefb6 |
| be76e5aabf |
| 2ae25f79cf |
| 8e60afa15e |
| b6d7392579 |
| e01ab595d8 |
| f13a07b1f8 |
| 6c9ba48fde |
| 1fb9c1b0bf |
| 31f46a0d35 |
| 3d49776bbb |
| bc2ef1f77c |
| 2e7fe7e79f |
| 26a68d5d7e |
| d081da0064 |
| 5bf8789b2a |
| d1537039ce |
| cc276443b5 |
| e585b583a9 |
| 090e945e36 |
| e1a3f5e831 |
| 19d02ff938 |
| 39d3f8d94f |
| b0298aa8cc |
| 260024a374 |
| d86f6b2afb |
| bd429f2b75 |
| 18e60d7d13 |
| c2ec430ab5 |
| c5d55356f9 |
| 172d1cd276 |
| a9b15c606f |
| 8df2dc3c88 |
| 6d792d2f31 |
| 0e088750af |
| dc4e3df5c2 |
| 3b00b9c26c |
| 344cd2b6f4 |
| 1b49148e47 |
| 4b377d6feb |
| 71d21c73ab |
| ee2da3e9ef |
| e2f6f26e86 |
| b28d2104de |
| 93d364da34 |
| d9cfbc891e |
| 70de39f6b4 |
| 68988d4e0d |
| 520db4dbc1 |
| f70bccac75 |
| 4bb98f2190 |
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
limit: 250
num_fewshot: 5
@@ -1,6 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+# pip install lm-eval==0.4.4

 usage() {
     echo``
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.3
+# pip install lm-eval==0.4.4

 usage() {
     echo``
@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)

     # Confirm scores match ground truth.
+    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
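With this change every task/metric pair is printed before the final assertion, and each individual comparison stays a relative-tolerance check via `numpy.isclose`. A minimal sketch of that acceptance criterion, with an illustrative `RTOL` and made-up scores rather than the values defined in the test module:

```python
import numpy

RTOL = 0.05  # illustrative tolerance; the real constant lives in the test file

ground_truth = 0.764   # expected GSM8K exact_match from the config
measured_value = 0.772  # hypothetical score returned by lm-eval

# Accept the run if the measured score is within RTOL * ground_truth.
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
```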
.buildkite/nightly-benchmarks/nightly-annotation.md (new file, 28 lines)
@@ -0,0 +1,28 @@

## Description

This file contains the downloading link for benchmarking results.

- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)

Please download the visualization scripts in the post


## Results reproduction

- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code

```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

And the results will be inside `./benchmarks/results`.
@@ -1,45 +1,39 @@
 # Nightly benchmark

-The main goal of this benchmarking is two-fold:
+This benchmark aims to:
-- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
-- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.

+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+

-## Docker images
+## Setup

-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
+- Docker images:
-- vllm/vllm-openai:v0.5.0.post1
+  - vLLM: `vllm/vllm-openai:v0.6.2`
-- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-- openmmlab/lmdeploy:v0.5.0
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-- ghcr.io/huggingface/text-generation-inference:2.1
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

-<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
+# Known issues

+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
-## Hardware
+- TGI does not support `ignore-eos` flag.

-One AWS node with 8x NVIDIA A100 GPUs.
-
-
-## Workload description
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-
-- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 500 prompts.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
-
-## Plots
-
-In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
-
-<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
-
-## Results
-
-{nightly_results_benchmarking_table}
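The fixed-seed Poisson arrival pattern described above (queries issued at a target average QPS) can be illustrated with a short sketch. The function name, seed handling, and the use of `numpy`'s generator here are illustrative assumptions, not code from the benchmark scripts:

```python
import numpy as np

def sample_arrival_times(num_requests: int, qps: float, seed: int = 0):
    """Sketch: draw request arrival times for a Poisson process at `qps`.

    With Poisson arrivals the gaps between consecutive requests are
    exponentially distributed with mean 1/qps; fixing the seed keeps the
    request trace identical across the engines being compared.
    """
    rng = np.random.default_rng(seed)
    if np.isinf(qps):
        # "inf" QPS means all requests are issued at once.
        return np.zeros(num_requests)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)

# Example: five arrival times at an average of 2 queries per second.
print(sample_arrival_times(5, qps=2.0))
```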
@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec

common_container_settings: &common_container_settings
  command:
-    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
@@ -37,7 +37,10 @@ common_container_settings: &common_container_settings

steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-  - label: "A100 trt benchmark"
+
+
+  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
@@ -46,7 +49,21 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+              - image: vllm/vllm-openai:v0.6.2
+                <<: *common_container_settings
+
+
+  - label: "A100 sglang benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
@@ -58,11 +75,13 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: openmmlab/lmdeploy:v0.5.0
+              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings

-  - label: "A100 vllm benchmark"
+
+  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
@@ -71,10 +90,25 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: vllm/vllm-openai:latest
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama8B"

-  - label: "A100 tgi benchmark"
+  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
@@ -83,12 +117,54 @@ steps:
          podSpec:
            <<: *common_pod_spec
            containers:
-              - image: ghcr.io/huggingface/text-generation-inference:2.1
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR
+                    value: "llama70B"
+
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings

  - wait

-  - label: "Plot"
+  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
@@ -117,4 +193,4 @@ steps:
                        name: hf-token-secret
                        key: token

-  - wait
+  - block: ":rocket: check the results!"
@@ -1,76 +0,0 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

check_hf_token() {
    # check if HF_TOKEN is available and valid
    if [[ -z "$HF_TOKEN" ]]; then
        echo "Error: HF_TOKEN is not set."
        exit 1
    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
        echo "Error: HF_TOKEN does not start with 'hf_'."
        exit 1
    else
        echo "HF_TOKEN is set and valid."
    fi
}

main() {

    check_gpus
    check_hf_token

    df -h

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)

    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json


    # run lmdeploy
    if which lmdeploy >/dev/null; then
        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
        exit 0
    fi

    # run tgi
    if [ -e /tgi-entrypoint.sh ]; then
        echo "tgi is available, redirect to run-tgi-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
        exit 0
    fi

    # run trt
    if which trtllm-build >/dev/null; then
        echo "trtllm is available, redirect to run-trt-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
        exit 0
    fi

    # run vllm
    if [ -e /vllm-workspace ]; then
        echo "vllm is available, redirect to run-vllm-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
        exit 0
    fi

}

main "$@"
@@ -0,0 +1,95 @@
import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description=
        'Parse command line arguments for summary-nightly-results script.')
    parser.add_argument('--results-folder',
                        type=str,
                        required=True,
                        help='The folder where the results are stored.')
    parser.add_argument('--description',
                        type=str,
                        required=True,
                        help='Description of the results.')

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):

    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df['Test name'].str.contains(model)
        target = target & df['Engine'].str.contains(method)
        target = target & df['Test name'].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.)
        else:
            means.append(filtered_df[metric].values[0])

    return np.array(means)


def get_perf_w_std(df, method, model, metric):

    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
        std = get_perf(df, method, model, "Std " + metric + " (ms)")
        if std.mean() == 0:
            std = None
        success = get_perf(df, method, model, "Successful req.")
        if std is not None:
            std = std / np.sqrt(success)
            std = std.tolist()

    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)")
        mean = mean.tolist()
        std = None

    return mean, std


def main(args):
    results_folder = Path(args.results_folder)

    results = []

    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file, "r") as f:
            results = results + json.loads(f.read())

    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

    with open(args.description, "r") as f:
        description = f.read()

    description = description.format(
        nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == '__main__':
    args = parse_arguments()
    main(args)
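The latency error bars derived from these results are the standard error of the mean: `get_perf_w_std` divides the reported standard deviation by the square root of the number of successful requests. A minimal sketch of that computation, with made-up numbers purely for illustration:

```python
import numpy as np

# Hypothetical per-QPS statistics pulled from one engine's results table.
mean_ttft_ms = np.array([55.0, 60.0, 72.0, 95.0, 140.0])   # Mean TTFT (ms)
std_ttft_ms = np.array([12.0, 14.0, 20.0, 31.0, 55.0])     # Std TTFT (ms)
successful_requests = np.array([500, 500, 500, 500, 500])  # Successful req.

# Standard error of the mean: std / sqrt(n); this is what the error bars show.
sem_ttft_ms = std_ttft_ms / np.sqrt(successful_requests)
print(sem_ttft_ms)
```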
.buildkite/nightly-benchmarks/scripts/launch-server.sh (new file, 241 lines)
@@ -0,0 +1,241 @@
#!/bin/bash

# Currently FP8 benchmark is NOT enabled.

set -x
server_params=$1
common_params=$2

json2args() {
    # transforms the JSON string to command line args, and '_' is replaced to '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}

launch_trt_server() {

    model_path=$(echo "$common_params" | jq -r '.model')
    model_name="${model_path#*/}"
    model_type=$(echo "$server_params" | jq -r '.model_type')
    model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
    model_tp_size=$(echo "$common_params" | jq -r '.tp')
    max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
    max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
    max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
    max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
    trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')

    # create model caching directory
    cd ~
    rm -rf models
    mkdir -p models
    cd models
    models_dir=$(pwd)
    trt_model_path=${models_dir}/${model_name}-trt-ckpt
    trt_engine_path=${models_dir}/${model_name}-trt-engine

    # clone tensorrt backend
    cd /
    rm -rf tensorrtllm_backend
    git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
    git lfs install
    cd tensorrtllm_backend
    git checkout $trt_llm_version
    tensorrtllm_backend_dir=$(pwd)
    git submodule update --init --recursive

    # build trtllm engine
    cd /tensorrtllm_backend
    cd ./tensorrt_llm/examples/${model_type}
    python3 convert_checkpoint.py \
        --model_dir ${model_path} \
        --dtype ${model_dtype} \
        --tp_size ${model_tp_size} \
        --output_dir ${trt_model_path}
    trtllm-build \
        --checkpoint_dir ${trt_model_path} \
        --use_fused_mlp \
        --reduce_fusion disable \
        --workers 8 \
        --gpt_attention_plugin ${model_dtype} \
        --gemm_plugin ${model_dtype} \
        --tp_size ${model_tp_size} \
        --max_batch_size ${max_batch_size} \
        --max_input_len ${max_input_len} \
        --max_seq_len ${max_seq_len} \
        --max_num_tokens ${max_num_tokens} \
        --output_dir ${trt_engine_path}

    # handle triton protobuf files and launch triton server
    cd /tensorrtllm_backend
    mkdir triton_model_repo
    cp -r all_models/inflight_batcher_llm/* triton_model_repo/
    cd triton_model_repo
    rm -rf ./tensorrt_llm/1/*
    cp -r ${trt_engine_path}/* ./tensorrt_llm/1
    python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
    python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
    python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
    python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
    python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
    cd /tensorrtllm_backend
    python3 scripts/launch_triton_server.py \
        --world_size=${model_tp_size} \
        --model_repo=/tensorrtllm_backend/triton_model_repo &

}

launch_tgi_server() {
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    server_args=$(json2args "$server_params")

    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
        echo "Key 'fp8' exists in common params."
        server_command="/tgi-entrypoint.sh \
            --model-id $model \
            --num-shard $tp \
            --port $port \
            --quantize fp8 \
            $server_args"
    else
        echo "Key 'fp8' does not exist in common params."
        server_command="/tgi-entrypoint.sh \
            --model-id $model \
            --num-shard $tp \
            --port $port \
            $server_args"
    fi

    echo "Server command: $server_command"
    eval "$server_command" &

}

launch_lmdeploy_server() {
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    server_args=$(json2args "$server_params")

    server_command="lmdeploy serve api_server $model \
        --tp $tp \
        --server-port $port \
        $server_args"

    # run the server
    echo "Server command: $server_command"
    bash -c "$server_command" &
}

launch_sglang_server() {

    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    server_args=$(json2args "$server_params")

    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
        echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
        model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
        server_command="python3 \
            -m sglang.launch_server \
            --tp $tp \
            --model-path $model \
            --port $port \
            $server_args"
    else
        echo "Key 'fp8' does not exist in common params."
        server_command="python3 \
            -m sglang.launch_server \
            --tp $tp \
            --model-path $model \
            --port $port \
            $server_args"
    fi

    # run the server
    echo "Server command: $server_command"
    eval "$server_command" &
}

launch_vllm_server() {

    export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    server_args=$(json2args "$server_params")

    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
        echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
        model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
        server_command="python3 \
            -m vllm.entrypoints.openai.api_server \
            -tp $tp \
            --model $model \
            --port $port \
            $server_args"
    else
        echo "Key 'fp8' does not exist in common params."
        server_command="python3 \
            -m vllm.entrypoints.openai.api_server \
            -tp $tp \
            --model $model \
            --port $port \
            $server_args"
    fi

    # run the server
    echo "Server command: $server_command"
    eval "$server_command" &
}

main() {

    if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
        launch_trt_server
    fi

    if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
        launch_tgi_server
    fi

    if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
        launch_lmdeploy_server
    fi

    if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
        launch_sglang_server
    fi

    if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
        launch_vllm_server
    fi
}

main
@@ -1,102 +0,0 @@
#!/bin/bash


server_params=$1
common_params=$2


model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')

cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine

cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)

# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt


cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/

cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}


if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then

    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
    python ../quantization/quantize.py \
        --model_dir ${model_path} \
        --dtype ${model_dtype} \
        --tp_size ${model_tp_size} \
        --output_dir ${trt_model_path} \
        --qformat fp8 \
        --kv_cache_dtype fp8 \
        --calib_size 2

else

    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
    python3 convert_checkpoint.py \
        --model_dir ${model_path} \
        --dtype ${model_dtype} \
        --tp_size ${model_tp_size} \
        --output_dir ${trt_model_path}

fi



trtllm-build \
    --checkpoint_dir=${trt_model_path} \
    --gpt_attention_plugin=${model_dtype} \
    --gemm_plugin=${model_dtype} \
    --remove_input_padding=enable \
    --paged_kv_cache=enable \
    --tp_size=${model_tp_size} \
    --max_batch_size=${max_batch_size} \
    --max_input_len=${max_input_len} \
    --max_output_len=${max_output_len} \
    --max_num_tokens=${max_output_len} \
    --opt_num_tokens=${max_output_len} \
    --output_dir=${trt_engine_path}

cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
    --world_size=${model_tp_size} \
    --model_repo=/tensorrtllm_backend/triton_model_repo &
@@ -8,6 +8,7 @@ main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get install -y zip)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
@@ -24,17 +25,54 @@ main() {
    ls
    ls results/

-    # generate figures
-    python3 -m pip install tabulate pandas matplotlib
-    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-        --description $description \
-        --results-folder results/
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"

-    # upload results and figures
-    /workspace/buildkite-agent artifact upload "nightly_results.png"
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+    # upload benchmarking scripts
+    cd $VLLM_SOURCE_CODE_LOC/
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+
+
+    # The figures should be genereated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    #     --description $description \
+    #     --results-folder results/
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    #     --description $description \
+    #     --results-folder results/ \
+    #     --dataset sonnet_128_2048
+
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
@@ -1,135 +0,0 @@
import argparse
import json
import math
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description=
        'Parse command line arguments for summary-nightly-results script.')
    parser.add_argument('--results-folder',
                        type=str,
                        required=True,
                        help='The folder where the results are stored.')
    parser.add_argument('--description',
                        type=str,
                        required=True,
                        help='Description of the results.')

    args = parser.parse_args()
    return args


def main(args):
    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
    results_folder = Path(args.results_folder)

    results = []

    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file, "r") as f:
            results = results + json.loads(f.read())

    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

    with open(args.description, "r") as f:
        description = f.read()

    description = description.format(
        nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)

    plt.rcParams.update({'font.size': 20})

    # plot results
    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
    fig.subplots_adjust(hspace=1)
    methods = ["vllm", "trt", "lmdeploy", "tgi"]
    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
        for j, metric in enumerate(["TTFT", "ITL"]):
            means, stds = [], []
            for method in methods:
                target = df['Test name'].str.contains(model)
                target = target & df['Engine'].str.contains(method)
                filtered_df = df[target]

                if filtered_df.empty:
                    means.append(0.)
                    stds.append(0.)
                else:
                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
                    std = filtered_df[f"Std {metric} (ms)"].values[0]
                    success = filtered_df["Successful req."].values[0]
                    stds.append(std / math.sqrt(success))

            print(model, metric)
            print(means, stds)

            ax = axes[i, j + 1]

            bars = ax.bar(
                ["vllm", "trt", "lmdeploy", "tgi"],
                means,
                yerr=stds,
                capsize=10,
            )
            for idx, bar in enumerate(bars):
                bar.set_color(bar_colors[idx])
            ax.set_ylim(bottom=0)

            ax.set_ylabel(f"{metric} (ms)")
            ax.set_title(f"{model} {metric}")
            ax.grid(axis='y')

        metric = "Tput"
        j = 0
        if True:
            tputs = []
            for method in methods:
                target = df['Test name'].str.contains(model)
                target = target & df['Engine'].str.contains(method)
                filtered_df = df[target]

                if filtered_df.empty:
                    tputs.append(0.)
                else:
                    input_tput = filtered_df["Input Tput (tok/s)"].values[0]
                    output_tput = filtered_df["Output Tput (tok/s)"].values[0]
                    tputs.append(input_tput + output_tput)

            print(model, metric)
            print(tputs)

            ax = axes[i, j]

            bars = ax.bar(
                ["vllm", "trt", "lmdeploy", "tgi"],
                tputs,
            )
            for idx, bar in enumerate(bars):
                bar.set_color(bar_colors[idx])

            ax.set_ylim(bottom=0)

            ax.set_ylabel("Tput (token/s)")
            ax.set_title(f"{model} {metric}")
            ax.grid(axis='y')

    fig.tight_layout()
    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)


if __name__ == '__main__':
    args = parse_arguments()
    main(args)
@@ -1,218 +0,0 @@
#!/bin/bash

set -o pipefail

check_gpus() {
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

kill_gpu_processes() {
    pkill lmdeploy || true
    # waiting for GPU processes to be fully killed
    sleep 10
    # Print the GPU memory usage
    # so that we know if all GPU processes are killed.
    gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
    # The memory usage should be 0 MB.
    echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

json2args() {
    # transforms the JSON string to command line args, and '_' is replaced to '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}

wait_for_server() {
    # wait for vllm server to start
    # return 1 if vllm server crashes
    timeout 1200 bash -c '
        until curl -s localhost:8000/v1/completions > /dev/null; do
            sleep 1
        done' && return 0 || return 1
}

run_serving_tests() {
    # run serving tests using `benchmark_serving.py`
    # $1: a json file specifying serving test cases

    local serving_test_file
    serving_test_file=$1

    # Iterate over serving tests
    jq -c '.[]' "$serving_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # append lmdeploy to the test name
        test_name=lmdeploy_$test_name

        # get common parameters
        common_params=$(echo "$params" | jq -r '.common_parameters')
        model=$(echo "$common_params" | jq -r '.model')
        tp=$(echo "$common_params" | jq -r '.tp')
        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
        port=$(echo "$common_params" | jq -r '.port')
        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

        # get client and server arguments
        server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
        client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
        server_args=$(json2args "$server_params")
        client_args=$(json2args "$client_params")
        qps_list=$(echo "$params" | jq -r '.qps_list')
        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
        echo "Running over qps list $qps_list"

        # check if there is enough GPU to run the test
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        # prepare tokenizer
        rm -rf /tokenizer_cache
        mkdir /tokenizer_cache
        python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
            --model "$model" \
            --cachedir /tokenizer_cache

        server_command="lmdeploy serve api_server $model \
            --tp $tp \
            --server-port $port \
            $server_args"

        # run the server
        echo "Running test case $test_name"
        echo "Server command: $server_command"
        bash -c "$server_command" &

        # wait until the server is alive
        wait_for_server
        if [ $? -eq 0 ]; then
            echo ""
            echo "lmdeploy server is up and running."
        else
            echo ""
            echo "lmdeploy failed to start within the timeout period."
            break
        fi

        # get model name
        model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)

        # iterate over different QPS
        for qps in $qps_list; do
            # remove the surrounding single quote from qps
            if [[ "$qps" == *"inf"* ]]; then
                echo "qps was $qps"
                qps="inf"
                echo "now qps is $qps"
            fi

            new_test_name=$test_name"_qps_"$qps

            client_command="python3 benchmark_serving.py \
                --backend lmdeploy \
                --tokenizer /tokenizer_cache \
                --dataset-name $dataset_name \
                --dataset-path $dataset_path \
                --num-prompts $num_prompts \
                --port $port \
                --save-result \
                --result-dir $RESULTS_FOLDER \
                --result-filename ${new_test_name}.json \
                --request-rate $qps \
                --model \"$model_name\" \
                $client_args"

            echo "Running test case $test_name with qps $qps"
            echo "Client command: $client_command"

            eval "$client_command"

            # record the benchmarking commands
            jq_output=$(jq -n \
                --arg server "$server_command" \
                --arg client "$client_command" \
                --arg gpu "$gpu_type" \
                --arg engine "lmdeploy" \
                '{
                    server_command: $server,
                    client_command: $client,
                    gpu_type: $gpu,
                    engine: $engine
                }')
            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

        done

        # clean up
        kill_gpu_processes
        rm -rf /root/.cache/huggingface/*
    done
}


upload_to_buildkite() {
    # upload the benchmarking results to buildkite

    # if the agent binary is not found, skip uploading the results, exit 0
    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip uploading the results."
        return 0
    fi
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}


main() {

    check_gpus
    # enter vllm directory
    cd $VLLM_SOURCE_CODE_LOC/benchmarks

    declare -g RESULTS_FOLDER=results/
    mkdir -p $RESULTS_FOLDER
    BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

    python -m pip install transformers==4.41.2

    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
    python -m pip install tabulate pandas
    python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
    upload_to_buildkite

}

main "$@"
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh (new file, 357 lines)
@@ -0,0 +1,357 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

check_hf_token() {
    # check if HF_TOKEN is available and valid
    if [[ -z "$HF_TOKEN" ]]; then
        echo "Error: HF_TOKEN is not set."
        exit 1
    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
        echo "Error: HF_TOKEN does not start with 'hf_'."
        exit 1
    else
        echo "HF_TOKEN is set and valid."
    fi
}
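
# check_hf_token expects the token in the HF_TOKEN environment variable; a
# typical setup (placeholder value shown) exports it before launching the run:
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx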

upload_to_buildkite() {
    # upload the benchmarking results to buildkite

    # if the agent binary is not found, skip uploading the results, exit 0
    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip uploading the results."
        return 0
    fi
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

get_current_llm_serving_engine() {

    if which lmdeploy >/dev/null; then
        echo "Container: lmdeploy"
        export CURRENT_LLM_SERVING_ENGINE=lmdeploy
        return
    fi

    if [ -e /tgi-entrypoint.sh ]; then
        echo "Container: tgi"
        export CURRENT_LLM_SERVING_ENGINE=tgi
        return
    fi

    if which trtllm-build >/dev/null; then
        echo "Container: tensorrt-llm"
        export CURRENT_LLM_SERVING_ENGINE=trt
        return
    fi

    if [ -e /sgl-workspace ]; then
        echo "Container: sglang"
        export CURRENT_LLM_SERVING_ENGINE=sglang
        return
    fi

    if [ -e /vllm-workspace ]; then
        echo "Container: vllm"
        # move to a completely irrelevant directory, to avoid import vllm from current folder
        export CURRENT_LLM_SERVING_ENGINE=vllm
        return
    fi
}
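
# Minimal usage sketch: the function only sets an environment variable, so a
# caller reads CURRENT_LLM_SERVING_ENGINE afterwards, e.g.
#   get_current_llm_serving_engine
#   echo "benchmarking inside a $CURRENT_LLM_SERVING_ENGINE container"
# Detection relies on container-specific binaries or paths (lmdeploy,
# /tgi-entrypoint.sh, trtllm-build, /sgl-workspace, /vllm-workspace).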

json2args() {
    # transforms the JSON string to command line args, and '_' is replaced to '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}
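
# Worked example of json2args, using values that appear in the
# vllm_server_parameters block of nightly-tests.json:
#   json2args '{"disable_log_stats": "", "gpu_memory_utilization": 0.9}'
# prints
#   --disable-log-stats  --gpu-memory-utilization 0.9
# i.e. keys become flags with '_' mapped to '-', and an empty string value
# yields a bare flag.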

kill_gpu_processes() {
    pkill -f python
    pkill -f python3
    pkill -f tritonserver
    pkill -f pt_main_thread
    pkill -f text-generation
    pkill -f lmdeploy

    while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
        sleep 1
    done
}

wait_for_server() {
    # wait for vllm server to start
    # return 1 if vllm server crashes
    timeout 1200 bash -c '
        until curl -s localhost:8000/v1/completions > /dev/null; do
            sleep 1
        done' && return 0 || return 1
}

ensure_installed() {
    # Ensure that the given command is installed by apt-get
    local cmd=$1
    if ! which $cmd >/dev/null; then
        apt-get update && apt-get install -y $cmd
    fi
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `benchmark_serving.py`
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over serving tests
|
||||||
|
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the GPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
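# Example (hypothetical invocation): setting TEST_SELECTOR to a regex such as
#   TEST_SELECTOR="llama8B.*sharegpt" bash run-nightly-benchmarks.sh
# makes the loop above skip every test case whose test_name does not match.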
|
||||||
|
|
||||||
|
# prepend the current serving engine to the test name
|
||||||
|
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||||
|
|
||||||
|
# get common parameters
|
||||||
|
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||||
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
|
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||||
|
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||||
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
|
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||||
|
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||||
|
|
||||||
|
# get client and server arguments
|
||||||
|
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||||
|
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
|
||||||
|
client_args=$(json2args "$client_params")
|
||||||
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
|
echo "Running over qps list $qps_list"
|
||||||
|
|
||||||
|
# check if there is enough GPU to run the test
|
||||||
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
|
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $reuse_server == "true" ]]; then
|
||||||
|
echo "Reuse previous server for test case $test_name"
|
||||||
|
else
|
||||||
|
kill_gpu_processes
|
||||||
|
bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
|
||||||
|
"$server_params" "$common_params"
|
||||||
|
fi
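# With the nightly-tests.json in this change, llama8B_tp1_sharegpt sets
# "reuse_server": false, so a fresh server is launched for it, while the two
# sonnet variants that follow set it to true and keep reusing that server.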
|
||||||
|
|
||||||
|
wait_for_server
|
||||||
|
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
echo ""
|
||||||
|
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||||
|
else
|
||||||
|
echo ""
|
||||||
|
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
# prepare tokenizer
|
||||||
|
# this is required for lmdeploy.
|
||||||
|
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||||
|
rm -rf /tokenizer_cache
|
||||||
|
mkdir /tokenizer_cache
|
||||||
|
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||||
|
--model "$model" \
|
||||||
|
--cachedir /tokenizer_cache
|
||||||
|
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||||
|
|
||||||
|
|
||||||
|
# change model name for lmdeploy (it will not follow standard hf name)
|
||||||
|
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||||
|
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||||
|
fi
|
||||||
|
|
||||||
|
# iterate over different QPS
|
||||||
|
for qps in $qps_list; do
|
||||||
|
# remove the surrounding single quote from qps
|
||||||
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
|
echo "qps was $qps"
|
||||||
|
qps="inf"
|
||||||
|
echo "now qps is $qps"
|
||||||
|
fi
|
||||||
|
|
||||||
|
new_test_name=$test_name"_qps_"$qps
|
||||||
|
|
||||||
|
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||||
|
|
||||||
|
if [[ $backend = "trt" ]]; then
|
||||||
|
backend="tensorrt-llm"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$backend" == *"vllm"* ]]; then
|
||||||
|
backend="vllm"
|
||||||
|
fi
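# Net effect of the two normalizations above: the engine name "trt" is mapped
# to the benchmark_serving.py backend "tensorrt-llm", and any vllm-flavored
# engine name collapses to plain "vllm"; lmdeploy, tgi and sglang pass
# through unchanged.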
|
||||||
|
|
||||||
|
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||||
|
|
||||||
|
client_command="python3 benchmark_serving.py \
|
||||||
|
--backend $backend \
|
||||||
|
--tokenizer /tokenizer_cache \
|
||||||
|
--model $model \
|
||||||
|
--dataset-name $dataset_name \
|
||||||
|
--dataset-path $dataset_path \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--port $port \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
--ignore-eos \
|
||||||
|
$client_args"
|
||||||
|
|
||||||
|
elif [[ "$dataset_name" = "sonnet" ]]; then
|
||||||
|
|
||||||
|
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
|
||||||
|
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||||
|
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||||
|
|
||||||
|
client_command="python3 benchmark_serving.py \
|
||||||
|
--backend $backend \
|
||||||
|
--tokenizer /tokenizer_cache \
|
||||||
|
--model $model \
|
||||||
|
--dataset-name $dataset_name \
|
||||||
|
--dataset-path $dataset_path \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--sonnet-input-len $sonnet_input_len \
|
||||||
|
--sonnet-output-len $sonnet_output_len \
|
||||||
|
--sonnet-prefix-len $sonnet_prefix_len \
|
||||||
|
--port $port \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
--ignore-eos \
|
||||||
|
$client_args"
|
||||||
|
|
||||||
|
else
|
||||||
|
|
||||||
|
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||||
|
exit 1
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo "Running test case $test_name with qps $qps"
|
||||||
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
|
eval "$client_command"
|
||||||
|
|
||||||
|
server_command="None"
|
||||||
|
|
||||||
|
# record the benchmarking commands
|
||||||
|
jq_output=$(jq -n \
|
||||||
|
--arg server "$server_command" \
|
||||||
|
--arg client "$client_command" \
|
||||||
|
--arg gpu "$gpu_type" \
|
||||||
|
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
|
||||||
|
'{
|
||||||
|
server_command: $server,
|
||||||
|
client_command: $client,
|
||||||
|
gpu_type: $gpu,
|
||||||
|
engine: $engine
|
||||||
|
}')
|
||||||
|
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
kill_gpu_processes
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
prepare_dataset() {
|
||||||
|
|
||||||
|
# download sharegpt dataset
|
||||||
|
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||||
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
|
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
||||||
|
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||||
|
echo "" > sonnet_4x.txt
|
||||||
|
for _ in {1..4}
|
||||||
|
do
|
||||||
|
cat sonnet.txt >> sonnet_4x.txt
|
||||||
|
done
|
||||||
|
|
||||||
|
}
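
# Optional sanity check after prepare_dataset: sonnet_4x.txt should hold
# roughly four copies of sonnet.txt, which is what allows benchmarking with
# input lengths up to 2048, e.g.
#   wc -l sonnet.txt sonnet_4x.txt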
|
||||||
|
|
||||||
|
main() {
|
||||||
|
|
||||||
|
# check if the environment variable is successfully injected from yaml
|
||||||
|
|
||||||
|
check_gpus
|
||||||
|
check_hf_token
|
||||||
|
get_current_llm_serving_engine
|
||||||
|
|
||||||
|
pip install -U transformers
|
||||||
|
|
||||||
|
# check storage
|
||||||
|
df -h
|
||||||
|
|
||||||
|
ensure_installed wget
|
||||||
|
ensure_installed curl
|
||||||
|
ensure_installed jq
|
||||||
|
|
||||||
|
prepare_dataset
|
||||||
|
|
||||||
|
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||||
|
declare -g RESULTS_FOLDER=results/
|
||||||
|
mkdir -p $RESULTS_FOLDER
|
||||||
|
BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
|
||||||
|
|
||||||
|
# run the test
|
||||||
|
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||||
|
|
||||||
|
# upload benchmark results to buildkite
|
||||||
|
python3 -m pip install tabulate pandas
|
||||||
|
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||||
|
upload_to_buildkite
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
@@ -1,216 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
pkill text-generation || true
|
|
||||||
# waiting for GPU processes to be fully killed
|
|
||||||
sleep 10
|
|
||||||
# Print the GPU memory usage
|
|
||||||
# so that we know if all GPU processes are killed.
|
|
||||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
|
||||||
# The memory usage should be 0 MB.
|
|
||||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `benchmark_serving.py`
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# append tgi to the test name
|
|
||||||
test_name=tgi_$test_name
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
|
||||||
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params."
|
|
||||||
server_command="/tgi-entrypoint.sh \
|
|
||||||
--model-id $model \
|
|
||||||
--num-shard $tp \
|
|
||||||
--port $port \
|
|
||||||
--quantize fp8 \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="/tgi-entrypoint.sh \
|
|
||||||
--model-id $model \
|
|
||||||
--num-shard $tp \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
|
|
||||||
# wait until the server is alive
|
|
||||||
wait_for_server
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "tgi server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "tgi failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
|
||||||
--backend tgi \
|
|
||||||
--model $model \
|
|
||||||
--dataset-name $dataset_name \
|
|
||||||
--dataset-path $dataset_path \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
--port $port \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
$client_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
--arg engine "tgi" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu,
|
|
||||||
engine: $engine
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill_gpu_processes
|
|
||||||
rm -rf /root/.cache/huggingface/*
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
check_gpus
|
|
||||||
# enter vllm directory
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
declare -g RESULTS_FOLDER=results/
|
|
||||||
mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
python -m pip install tabulate pandas
|
|
||||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@@ -1,214 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
pkill tritonserver || true
|
|
||||||
# waiting for GPU processes to be fully killed
|
|
||||||
sleep 20
|
|
||||||
# Print the GPU memory usage
|
|
||||||
# so that we know if all GPU processes are killed.
|
|
||||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
|
||||||
# The memory usage should be 0 MB.
|
|
||||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `benchmark_serving.py`
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# append trt to the test name
|
|
||||||
test_name=trt_$test_name
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
|
|
||||||
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
|
|
||||||
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
|
|
||||||
|
|
||||||
# wait until the server is alive
|
|
||||||
wait_for_server
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "trt server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "trt failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# prepare tokenizer
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
rm -rf /tokenizer_cache
|
|
||||||
mkdir /tokenizer_cache
|
|
||||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
|
||||||
--model "$model" \
|
|
||||||
--cachedir /tokenizer_cache
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
|
||||||
--backend tensorrt-llm \
|
|
||||||
--tokenizer /tokenizer_cache \
|
|
||||||
--model $model \
|
|
||||||
--dataset-name $dataset_name \
|
|
||||||
--dataset-path $dataset_path \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
--port $port \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
$client_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
server_command=""
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
--arg engine "trt" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu,
|
|
||||||
engine: $engine
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill_gpu_processes
|
|
||||||
rm -rf /root/.cache/huggingface/*
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
check_gpus
|
|
||||||
|
|
||||||
|
|
||||||
# enter vllm directory
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
|
|
||||||
declare -g RESULTS_FOLDER=results/
|
|
||||||
mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
# update transformers package, to make sure mixtral tokenizer is available
|
|
||||||
python -m pip install transformers -U
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
python -m pip install tabulate pandas
|
|
||||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@@ -1,221 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
# kill all processes on GPU.
|
|
||||||
pkill pt_main_thread
|
|
||||||
sleep 10
|
|
||||||
|
|
||||||
# remove vllm config file
|
|
||||||
rm -rf ~/.config/vllm
|
|
||||||
|
|
||||||
# Print the GPU memory usage
|
|
||||||
# so that we know if all GPU processes are killed.
|
|
||||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
|
||||||
# The memory usage should be 0 MB.
|
|
||||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
# wait for vllm server to start
|
|
||||||
# return 1 if vllm server crashes
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `benchmark_serving.py`
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# append vllm to the test name
|
|
||||||
test_name=vllm_$test_name
|
|
||||||
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
|
|
||||||
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
|
||||||
server_command="python3 \
|
|
||||||
-m vllm.entrypoints.openai.api_server \
|
|
||||||
-tp $tp \
|
|
||||||
--model $model \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="python3 \
|
|
||||||
-m vllm.entrypoints.openai.api_server \
|
|
||||||
-tp $tp \
|
|
||||||
--model $model \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
|
|
||||||
# wait until the server is alive
|
|
||||||
wait_for_server
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "vllm server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "vllm failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model $model \
|
|
||||||
--dataset-name $dataset_name \
|
|
||||||
--dataset-path $dataset_path \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
--port $port \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
$client_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
--arg engine "vllm" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu,
|
|
||||||
engine: $engine
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill_gpu_processes
|
|
||||||
rm -rf /root/.cache/huggingface/*
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
check_gpus
|
|
||||||
# enter vllm directory
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
declare -g RESULTS_FOLDER=results/
|
|
||||||
mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
|
|
||||||
python3 -m pip install tabulate pandas
|
|
||||||
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@@ -17,10 +17,17 @@ serving_column_mapping = {
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "input_throughput": "Input Tput (tok/s)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
}
|||||||
@@ -1,16 +1,18 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"test_name": "llama8B_tp1",
|
"test_name": "llama8B_tp1_sharegpt",
|
||||||
"qps_list": [4],
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
"common_parameters": {
|
"common_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
"tp": 1,
|
"tp": 1,
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 500,
|
"num_prompts": 500,
|
||||||
"port": 8000
|
"port": 8000,
|
||||||
|
"reuse_server": false
|
||||||
},
|
},
|
||||||
"lmdeploy_server_parameters": {
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"lmdeploy_client_parameters": {
|
"lmdeploy_client_parameters": {
|
||||||
},
|
},
|
||||||
@@ -21,34 +23,158 @@
|
|||||||
},
|
},
|
||||||
"trt_server_parameters": {
|
"trt_server_parameters": {
|
||||||
"model_type": "llama",
|
"model_type": "llama",
|
||||||
"model_dtype": "float16",
|
"model_dtype": "bfloat16",
|
||||||
"max_batch_size": 256,
|
"max_batch_size": 2048,
|
||||||
"max_input_len": 4096,
|
"max_input_len": 4096,
|
||||||
"max_output_len": 4096,
|
"max_seq_len": 6144,
|
||||||
"trt_llm_version": "r24.04"
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
},
|
},
|
||||||
"trt_client_parameters": {
|
"trt_client_parameters": {
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": ""
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"vllm_client_parameters": {
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"enable_torch_compile": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "llama70B_tp4",
|
"test_name": "llama8B_tp1_sonnet_512_16",
|
||||||
"qps_list": [2],
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
|
"tp": 1,
|
||||||
|
"dataset_name": "sonnet",
|
||||||
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 16,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "bfloat16",
|
||||||
|
"max_batch_size": 2048,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_seq_len": 6144,
|
||||||
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"enable_torch_compile": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama8B_tp1_sonnet_512_256",
|
||||||
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
|
"tp": 1,
|
||||||
|
"dataset_name": "sonnet",
|
||||||
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 256,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "bfloat16",
|
||||||
|
"max_batch_size": 2048,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_seq_len": 6144,
|
||||||
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"enable_torch_compile": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama70B_tp4_sharegpt",
|
||||||
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
"common_parameters": {
|
"common_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
"tp": 4,
|
"tp": 4,
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 500,
|
"num_prompts": 500,
|
||||||
"port": 8000
|
"port": 8000,
|
||||||
|
"reuse_server": false
|
||||||
},
|
},
|
||||||
"lmdeploy_server_parameters": {
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"lmdeploy_client_parameters": {
|
"lmdeploy_client_parameters": {
|
||||||
},
|
},
|
||||||
@@ -59,34 +185,50 @@
|
|||||||
},
|
},
|
||||||
"trt_server_parameters": {
|
"trt_server_parameters": {
|
||||||
"model_type": "llama",
|
"model_type": "llama",
|
||||||
"model_dtype": "float16",
|
"model_dtype": "bfloat16",
|
||||||
"max_batch_size": 256,
|
"max_batch_size": 2048,
|
||||||
"max_input_len": 4096,
|
"max_input_len": 4096,
|
||||||
"max_output_len": 4096,
|
"max_seq_len": 6144,
|
||||||
"trt_llm_version": "r24.04"
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
},
|
},
|
||||||
"trt_client_parameters": {
|
"trt_client_parameters": {
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": ""
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"vllm_client_parameters": {
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "mixtral8x7B_tp2",
|
"test_name": "llama70B_tp4_sonnet_512_16",
|
||||||
"qps_list": [2],
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
"common_parameters": {
|
"common_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
"tp": 2,
|
"tp": 4,
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sonnet",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
"num_prompts": 500,
|
"num_prompts": 500,
|
||||||
"port": 8000
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 16,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
},
|
},
|
||||||
"lmdeploy_server_parameters": {
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"lmdeploy_client_parameters": {
|
"lmdeploy_client_parameters": {
|
||||||
},
|
},
|
||||||
@@ -97,20 +239,85 @@
|
|||||||
},
|
},
|
||||||
"trt_server_parameters": {
|
"trt_server_parameters": {
|
||||||
"model_type": "llama",
|
"model_type": "llama",
|
||||||
"model_dtype": "float16",
|
"model_dtype": "bfloat16",
|
||||||
"max_batch_size": 256,
|
"max_batch_size": 2048,
|
||||||
"max_input_len": 4096,
|
"max_input_len": 4096,
|
||||||
"max_output_len": 4096,
|
"max_seq_len": 6144,
|
||||||
"trt_llm_version": "r24.04"
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
},
|
},
|
||||||
"trt_client_parameters": {
|
"trt_client_parameters": {
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": ""
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"vllm_client_parameters": {
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama70B_tp4_sonnet_512_256",
|
||||||
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tp": 4,
|
||||||
|
"dataset_name": "sonnet",
|
||||||
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 256,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "bfloat16",
|
||||||
|
"max_batch_size": 2048,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_seq_len": 6144,
|
||||||
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -3,13 +3,14 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: cpu_queue
|
queue: cpu_queue
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
# rename the files to change linux -> manylinux1
|
# rename the files to change linux -> manylinux1
|
||||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||||
|
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
@@ -21,7 +22,7 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: cpu_queue
|
queue: cpu_queue
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
# rename the files to change linux -> manylinux1
|
# rename the files to change linux -> manylinux1
|
||||||
|
|||||||
@@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
|
|||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
pip install pytest matplotlib einops transformers_stream_generator
|
pip install pytest matplotlib einops transformers_stream_generator
|
||||||
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
pytest -v -s tests/models -m \"not vlm\" \
|
||||||
|
--ignore=tests/models/test_embedding.py \
|
||||||
|
--ignore=tests/models/test_oot_registration.py \
|
||||||
|
--ignore=tests/models/test_registry.py \
|
||||||
|
--ignore=tests/models/test_jamba.py \
|
||||||
|
--ignore=tests/models/test_mamba.py \
|
||||||
|
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
|
||||||
|
|
||||||
# online inference
|
# online inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
|
|||||||
@@ -23,16 +23,24 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
|
|||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
|
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
|
||||||
|
pytest -v -s tests/models/encoder_decoder/language
|
||||||
pytest -v -s tests/models/decoder_only/language \
|
pytest -v -s tests/models/decoder_only/language \
|
||||||
--ignore=tests/models/test_fp8.py \
|
--ignore=tests/models/test_fp8.py \
|
||||||
--ignore=tests/models/decoder_only/language/test_jamba.py \
|
--ignore=tests/models/decoder_only/language/test_jamba.py \
|
||||||
|
--ignore=tests/models/decoder_only/language/test_mamba.py \
|
||||||
|
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
|
||||||
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||||
|
|
||||||
# Run compressed-tensor test
|
# Run compressed-tensor test
|
||||||
|
# docker exec cpu-test bash -c "
|
||||||
|
# pytest -s -v \
|
||||||
|
# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||||
|
# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
|
||||||
|
|
||||||
|
# Run AWQ test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
tests/quantization/test_ipex_quant.py"
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
|
|
||||||
|
|
||||||
# online inference
|
# online inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
|
|||||||
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
|
|||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image and launch offline inference
|
# Run the image and launch offline inference
|
||||||
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
|
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
|
||||||

@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]

@@ -39,7 +40,7 @@ steps:
 # Check API reference (if it fails, you may have missing mock imports)
 - grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
 fast_check: true
 source_file_dependencies:
 - vllm/

@@ -63,14 +64,22 @@ steps:
 fast_check: true
 source_file_dependencies:
 - vllm/
-- tests/basic_correctness
+- tests/basic_correctness/test_basic_correctness
+- tests/basic_correctness/test_cpu_offload
+- tests/basic_correctness/test_preemption
 commands:
 - pytest -v -s basic_correctness/test_basic_correctness.py
 - pytest -v -s basic_correctness/test_cpu_offload.py
-- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

+- label: Chunked Prefill Test
+source_file_dependencies:
+- vllm/
+- tests/basic_correctness/test_chunked_prefill
+commands:
+- VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+- VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py

 - label: Core Test # 10min
 mirror_hardwares: [amd]
 fast_check: true

@@ -79,9 +88,13 @@ steps:
 - vllm/distributed
 - tests/core
 commands:
-- pytest -v -s core
+- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
+- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
+- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
+- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
+- pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
 working_dir: "/vllm-workspace/tests"
 fast_check: true
 mirror_hardwares: [amd]

@@ -89,13 +102,13 @@ steps:
 - vllm/
 commands:
 - pip install -e ./plugins/vllm_add_dummy_model
-- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
 - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
 - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-- pytest -v -s entrypoints/openai
+- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
 - pytest -v -s entrypoints/test_chat_utils.py
 - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -108,7 +121,9 @@ steps:
 - vllm/core/
 - tests/distributed
 - tests/spec_decode/e2e/test_integration_dist_tp4
+- tests/compile
 commands:
+- pytest -v -s compile/test_basic_correctness.py
 - pytest -v -s distributed/test_pynccl.py
 - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

@@ -136,7 +151,9 @@ steps:
 source_file_dependencies:
 - vllm/
 - tests/test_regression
-command: pytest -v -s test_regression.py
+commands:
+- pip install modelscope
+- pytest -v -s test_regression.py
 working_dir: "/vllm-workspace/tests" # optional

 - label: Engine Test # 10min

@@ -150,7 +167,7 @@ steps:
 # OOM in the CI unless we run this separately
 - pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
 working_dir: "/vllm-workspace/examples"
 #mirror_hardwares: [amd]
 source_file_dependencies:

@@ -168,15 +185,16 @@ steps:
 - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 - python3 offline_inference_encoder_decoder.py

-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
 - tests/prefix_caching
 commands:
-- pytest -v -s prefix_caching
+- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
+- pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
 source_file_dependencies:
 - vllm/model_executor/layers
 - vllm/sampling_metadata.py

@@ -192,17 +210,16 @@ steps:
 - tests/test_logits_processor
 command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
 source_file_dependencies:
 - vllm/spec_decode
 - tests/spec_decode
 commands:
-# See https://github.com/vllm-project/vllm/issues/5152
-- export VLLM_ATTENTION_BACKEND=XFORMERS
 - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
+- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
 mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/lora

@@ -210,22 +227,24 @@ steps:
 command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
 parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
 fast_check: true
 source_file_dependencies:
 - vllm/
 - tests/compile
 commands:
-- pytest -v -s compile/test_full_graph_smoke.py
+- pytest -v -s compile/test_basic_correctness.py

-- label: "PyTorch Fullgraph Test"
-source_file_dependencies:
-- vllm/
-- tests/compile
-commands:
-- pytest -v -s compile/test_full_graph.py
+# TODO: re-write in comparison tests, and fix symbolic shape
+# for quantization ops.
+# - label: "PyTorch Fullgraph Test" # 18min
+# source_file_dependencies:
+# - vllm/
+# - tests/compile
+# commands:
+# - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
 mirror_hardwares: [amd]
 source_file_dependencies:
 - csrc/

@@ -255,12 +274,12 @@ steps:
 - pip install aiohttp
 - bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
 source_file_dependencies:
 - csrc/
 - vllm/model_executor/layers/quantization
 - tests/quantization
-command: pytest -v -s quantization
+command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"

@@ -268,7 +287,6 @@ steps:
 - csrc/
 - vllm/model_executor/layers/quantization
 commands:
-- pip install lm-eval
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 - bash ./run-tests.sh -c configs/models-small.txt -t 1

@@ -299,7 +317,7 @@ steps:
 - pytest -v -s models/test_oot_registration.py # it needs a clean process
 - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/

@@ -307,7 +325,7 @@ steps:
 commands:
 - pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/

@@ -317,15 +335,26 @@ steps:
 - pytest -v -s models/decoder_only/audio_language
 - pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
 - tests/models/embedding/language
 - tests/models/encoder_decoder/language
+- tests/models/encoder_decoder/vision_language
 commands:
 - pytest -v -s models/embedding/language
 - pytest -v -s models/encoder_decoder/language
+- pytest -v -s models/encoder_decoder/vision_language

+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+optional: true
+commands:
+- echo 'Testing custom models...'
+# PR authors can temporarily add commands below to test individual models
+# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 ##### 1 GPU test #####
 ##### multi gpus test #####

@@ -358,7 +387,7 @@ steps:
 - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
 #mirror_hardwares: [amd]
 working_dir: "/vllm-workspace/tests"
 num_gpus: 2

@@ -370,19 +399,21 @@ steps:
 - tests/distributed/
 - vllm/compilation
 commands:
-- pytest -v -s ./compile/test_full_graph_multi_gpu.py
+- pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+- TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
 # Avoid importing model tests that cause CUDA reinitialization error
-- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
 - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
 - pip install -e ./plugins/vllm_add_dummy_model
 - pytest -v -s distributed/test_distributed_oot.py
 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
 working_dir: "/vllm-workspace/tests"
 num_gpus: 4
 source_file_dependencies:

@@ -400,7 +431,7 @@ steps:
 - pytest -v -s multi_step/test_correctness_async_llm.py
 - pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
 working_dir: "/vllm-workspace/tests"
 num_gpus: 4
 source_file_dependencies:

@@ -426,7 +457,7 @@ steps:
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 - pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
 working_dir: "/vllm-workspace/tests"
 num_gpus: 2
 source_file_dependencies:

@@ -459,7 +490,7 @@ steps:
 # NOTE: don't test llama model here, it seems hf implementation is buggy
 # see https://github.com/vllm-project/vllm/pull/5689 for details
 - pytest -v -s distributed/test_custom_all_reduce.py
-- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
 - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional

@@ -470,6 +501,5 @@ steps:
 - csrc/
 - vllm/model_executor/layers/quantization
 commands:
-- pip install lm-eval
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 - bash ./run-tests.sh -c configs/models-large.txt -t 4

@@ -1,4 +1,34 @@
-vllm/*.so
+/.github/
 /.venv
 /build
 dist
+vllm/*.so
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.mypy_cache
+
+# Distribution / packaging
+.Python
+/build/
+cmake-build-*/
+CMakeUserPresets.json
+develop-eggs/
+/dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST

.github/CODEOWNERS (new file, 30 lines)
@@ -0,0 +1,30 @@
+# See https://help.github.com/articles/about-codeowners/
+# for more info about CODEOWNERS file
+
+# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth @WoosukKwon
+
+# Test ownership
+/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/models @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/distributed/test_pipeline_parallel.py @youkaichao
+/tests/distributed/test_same_node.py @youkaichao
+/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/weight_loading @mgoin @youkaichao
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

.github/dependabot.yml (new file, 7 lines)
@@ -0,0 +1,7 @@
+version: 2
+updates:
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"

.github/workflows/actionlint.yml (new file, 37 lines)
@@ -0,0 +1,37 @@
+name: Lint GitHub Actions workflows
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          fetch-depth: 0
+
+      - name: "Run actionlint"
+        run: |
+          tools/actionlint.sh -color

.github/workflows/add_label_automerge.yml (2 lines changed)
@@ -8,7 +8,7 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Add label
-uses: actions/github-script@v5
+uses: actions/github-script@v7
 with:
 script: |
 github.rest.issues.addLabels({

.github/workflows/clang-format.yml (4 lines changed)
@@ -17,9 +17,9 @@ jobs:
 matrix:
 python-version: ["3.11"]
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: Set up Python ${{ matrix.python-version }}
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}
 - name: Install dependencies

.github/workflows/matchers/actionlint.json (new file, 17 lines)
@@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "actionlint",
+      "pattern": [
+        {
+          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "message": 4,
+          "code": 5
+        }
+      ]
+    }
+  ]
+}
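
For context on what this matcher parses: actionlint prints diagnostics in a `file:line:col: message [rule]` shape, optionally wrapped in ANSI colour codes, and the capture groups above map those pieces onto GitHub's annotation fields. The snippet below is only an illustration with a made-up diagnostic line and a simplified, colour-free version of the pattern; it is not taken from the matcher itself.

```bash
# Hypothetical actionlint diagnostic in the file:line:col: message [rule] shape
# that the problem matcher above is written to recognise.
sample='.github/workflows/publish.yml:42:9: shellcheck reported issue in this script: SC2086 [shellcheck]'

# A simplified check of the same shape (omits the ANSI-escape handling that the
# real regexp includes); prints the line only if it matches.
echo "$sample" | grep -E '^[^:]+:[0-9]+:[0-9]+: .+ \[[^]]+\]$'
```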

.github/workflows/mypy.yaml (19 lines changed)
@@ -11,15 +11,15 @@ on:
 - main

 jobs:
-ruff:
+mypy:
 runs-on: ubuntu-latest
 strategy:
 matrix:
 python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: Set up Python ${{ matrix.python-version }}
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}
 - name: Install dependencies

@@ -32,15 +32,4 @@ jobs:
 pip install types-setuptools
 - name: Mypy
 run: |
-mypy
-mypy tests --follow-imports skip
-mypy vllm/attention --follow-imports skip
-mypy vllm/distributed --follow-imports skip
-mypy vllm/engine --follow-imports skip
-mypy vllm/executor --follow-imports skip
-mypy vllm/lora --follow-imports skip
-mypy vllm/model_executor --follow-imports skip
-mypy vllm/prompt_adapter --follow-imports skip
-mypy vllm/spec_decode --follow-imports skip
-mypy vllm/worker --follow-imports skip
+tools/mypy.sh
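
The workflow step now just calls `tools/mypy.sh`. That script's contents are not part of this compare view, but judging from the per-package invocations removed above, a wrapper along the following lines would reproduce the previous behaviour. This is a sketch under that assumption, not the actual script.

```bash
#!/usr/bin/env bash
# Hypothetical reconstruction of tools/mypy.sh, based only on the commands the
# mypy workflow used to run inline; the real script may differ.
set -e

run_mypy() {
    echo "Running mypy on $*"
    mypy --follow-imports skip "$@"
}

mypy  # packages configured for full checking in the repo's mypy settings
run_mypy tests
run_mypy vllm/attention
run_mypy vllm/distributed
run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/prompt_adapter
run_mypy vllm/spec_decode
run_mypy vllm/worker
```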

.github/workflows/publish.yml (16 lines changed)
@@ -21,16 +21,16 @@ jobs:
 upload_url: ${{ steps.create_release.outputs.upload_url }}
 steps:
 - name: Checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4

 - name: Extract branch info
 shell: bash
 run: |
-echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"

 - name: Create Release
 id: create_release
-uses: "actions/github-script@v6"
+uses: "actions/github-script@v7"
 env:
 RELEASE_TAG: ${{ env.release_tag }}
 with:

@@ -54,7 +54,7 @@ jobs:

 steps:
 - name: Checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4

 - name: Setup ccache
 uses: hendrikmuhs/ccache-action@v1.2

@@ -68,7 +68,7 @@ jobs:
 bash -x .github/workflows/scripts/env.sh

 - name: Set up Python
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}

@@ -86,10 +86,10 @@ jobs:
 CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
 run: |
 bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
 asset_name=${wheel_name//"linux"/"manylinux1"}
-echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
-echo "asset_name=${asset_name}" >> $GITHUB_ENV
+echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

 - name: Upload Release Asset
 uses: actions/upload-release-asset@v1
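
The wheel-name change above swaps an `ls` pipeline for a null-delimited `find`, which keeps working even if a path ever contains whitespace and avoids parsing `ls` output. A quick illustration with a throwaway directory (the paths are made up, not part of the workflow):

```bash
# Create a wheel-like file whose name contains a space.
demo_dir=$(mktemp -d)
touch "$demo_dir/pkg one-1.0-py3-none-any.whl"

# xargs splits the ls output on whitespace, so the name comes out mangled:
ls "$demo_dir"/*whl | xargs -n 1 basename

# -print0 / -0 pass each path through as a single argument, so basename sees it whole:
find "$demo_dir" -name "*whl" -print0 | xargs -0 -n 1 basename
```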

.github/workflows/reminder_comment.yml (2 lines changed)
@@ -8,7 +8,7 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Remind to run full CI on PR
-uses: actions/github-script@v6
+uses: actions/github-script@v7
 with:
 script: |
 github.rest.issues.createComment({

.github/workflows/ruff.yml (4 lines changed)
@@ -17,9 +17,9 @@ jobs:
 matrix:
 python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: Set up Python ${{ matrix.python-version }}
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}
 - name: Install dependencies

.github/workflows/scripts/build.sh (3 lines changed)
@@ -8,8 +8,7 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

 # Install requirements
-$python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements-cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1

.github/workflows/yapf.yml (4 lines changed)
@@ -16,9 +16,9 @@ jobs:
 matrix:
 python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: Set up Python ${{ matrix.python-version }}
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}
 - name: Install dependencies

.gitignore (4 lines changed)
@@ -33,6 +33,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+/.deps/

 # PyInstaller
 # Usually these files are written by a python script from a template

@@ -198,3 +199,6 @@ hip_compat.h

 # Benchmark dataset
 benchmarks/*.json
+
+# Linting
+actionlint

@@ -13,10 +13,10 @@ sphinx:
 fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
-formats:
-- pdf
+formats: []

 # Optionally declare the Python requirements required to build your docs
 python:
 install:
 - requirements: docs/requirements-docs.txt

CMakeLists.txt (265 lines changed)
@@ -143,14 +143,32 @@ else()
 message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()

-#
-# Override the GPU architectures detected by cmake/torch and filter them by
-# the supported versions for the current language.
-# The final set of arches is stored in `VLLM_GPU_ARCHES`.
-#
-override_gpu_arches(VLLM_GPU_ARCHES
-${VLLM_GPU_LANG}
-"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+#
+# For cuda we want to be able to control which architectures we compile for on
+# a per-file basis in order to cut down on compile time. So here we extract
+# the set of architectures we want to compile for and remove the from the
+# CMAKE_CUDA_FLAGS so that they are not applied globally.
+#
+clear_cuda_arches(CUDA_ARCH_FLAGS)
+extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+# Filter the target architectures by the supported supported archs
+# since for some files we will build for all CUDA_ARCHS.
+cuda_archs_loose_intersection(CUDA_ARCHS
+"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+else()
+#
+# For other GPU targets override the GPU architectures detected by cmake/torch
+# and filter them by the supported versions for the current language.
+# The final set of arches is stored in `VLLM_GPU_ARCHES`.
+#
+override_gpu_arches(VLLM_GPU_ARCHES
+${VLLM_GPU_LANG}
+"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+endif()

 #
 # Query torch for additional GPU compilation flags for the given

@@ -166,7 +184,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
 list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
+#
 include(FetchContent)
+get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
+set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

 #
 # Define other extension targets

@@ -214,30 +241,89 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 "csrc/quantization/aqlm/gemm_kernels.cu"
 "csrc/quantization/awq/gemm_kernels.cu"
-"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
-"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
-"csrc/quantization/gptq_marlin/gptq_marlin.cu"
-"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
 "csrc/quantization/gguf/gguf_kernel.cu"
-"csrc/quantization/fp8/fp8_marlin.cu"
 "csrc/custom_all_reduce.cu"
 "csrc/permute_cols.cu"
-"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
-"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
+
+set_gencode_flags_for_srcs(
+SRCS "${VLLM_EXT_SRC}"
+CUDA_ARCHS "${CUDA_ARCHS}")
+
+# Only build Marlin kernels if we are building for at least some compatible archs.
+# Keep building Marlin for 9.0 as there are some group sizes and shapes that
+# are not supported by Machete yet.
+cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
+if (MARLIN_ARCHS)
+set(MARLIN_SRCS
+"csrc/quantization/fp8/fp8_marlin.cu"
+"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
+"csrc/quantization/gptq_marlin/gptq_marlin.cu"
+"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
+set_gencode_flags_for_srcs(
+SRCS "${MARLIN_SRCS}"
+CUDA_ARCHS "${MARLIN_ARCHS}")
+list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+else()
+message(STATUS "Not building Marlin kernels as no compatible archs found"
+"in CUDA target architectures")
+endif()
+
 #
-# The CUTLASS kernels for Hopper require sm90a to be enabled.
-# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
-# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
-set_source_files_properties(
-"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
-PROPERTIES
-COMPILE_FLAGS
-"-gencode arch=compute_90a,code=sm_90a")
+# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
+# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
+cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+set_gencode_flags_for_srcs(
+SRCS "${SRCS}"
+CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+list(APPEND VLLM_EXT_SRC "${SRCS}")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
+message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+else()
+# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+# build any 3x kernels
+set(SCALED_MM_3X_ARCHS)
+
+if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+"later if you intend on running FP8 quantized models on "
+"Hopper.")
+else()
+message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+"in CUDA target architectures")
+endif()
+endif()
+
+#
+# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
+# kernels for the remaining archs that are not already built for 3x.
+cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+"7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
+# subtract out the archs that are already built for 3x
+list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
+if (SCALED_MM_2X_ARCHS)
+set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+set_gencode_flags_for_srcs(
+SRCS "${SRCS}"
+CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
+list(APPEND VLLM_EXT_SRC "${SRCS}")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
+message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
+else()
+if (SCALED_MM_3X_ARCHS)
+message(STATUS "Not building scaled_mm_c2x as all archs are already built"
+" for and covered by scaled_mm_c3x")
+else()
+message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
+"in CUDA target architectures")
+endif()
 endif()


@@ -245,47 +331,72 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # Machete kernels

 # The machete kernels only work on hopper and require CUDA 12.0 or later.
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+# Only build Machete kernels if we are building for something compatible with sm90a
+cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
 #
 # For the Machete kernels we automatically generate sources for various
 # preselected input type pairs and schedules.
 # Generate sources:
-execute_process(
-COMMAND ${CMAKE_COMMAND} -E env
-PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
-${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
-RESULT_VARIABLE machete_generation_result
-OUTPUT_VARIABLE machete_generation_output
-OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-)
+set(MACHETE_GEN_SCRIPT
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
+file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

-if (NOT machete_generation_result EQUAL 0)
-message(FATAL_ERROR "Machete generation failed."
-" Result: \"${machete_generation_result}\""
-"\nCheck the log for details: "
-"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
+message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
+message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
+
+if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
+OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
+execute_process(
+COMMAND ${CMAKE_COMMAND} -E env
+PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
+RESULT_VARIABLE machete_generation_result
+OUTPUT_VARIABLE machete_generation_output
+OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+)
+
+if (NOT machete_generation_result EQUAL 0)
+message(FATAL_ERROR "Machete generation failed."
+" Result: \"${machete_generation_result}\""
+"\nCheck the log for details: "
+"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
+else()
+set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+CACHE STRING "Last run machete generate script hash" FORCE)
+message(STATUS "Machete generation completed successfully.")
+endif()
 else()
-message(STATUS "Machete generation completed successfully.")
+message(STATUS "Machete generation script has not changed, skipping generation.")
 endif()

 # Add machete generated sources
 file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
 list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
-message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")

-set_source_files_properties(
-${MACHETE_GEN_SOURCES}
-PROPERTIES
-COMPILE_FLAGS
-"-gencode arch=compute_90a,code=sm_90a")
+# forward compatible
+set_gencode_flags_for_srcs(
+SRCS "${MACHETE_GEN_SOURCES}"
+CUDA_ARCHS "${MACHETE_ARCHS}")
+
+list(APPEND VLLM_EXT_SRC
+csrc/quantization/machete/machete_pytorch.cu)
+
+message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
+else()
+if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+AND MACHETE_ARCHS)
+message(STATUS "Not building Machete kernels as CUDA Compiler version is "
+"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+"later if you intend on running w4a16 quantized models on "
+"Hopper.")
+else()
+message(STATUS "Not building Machete kernels as no compatible archs "
+"found in CUDA target architectures")
+endif()
 endif()
+# if CUDA endif
-# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
-# raise an error if the user that this was built with an incompatible
-# CUDA version)
-list(APPEND VLLM_EXT_SRC
-csrc/quantization/machete/machete_pytorch.cu)
 endif()

 message(STATUS "Enabling C extension.")

@@ -314,14 +425,33 @@ set(VLLM_MOE_EXT_SRC
 "csrc/moe/torch_bindings.cpp"
 "csrc/moe/topk_softmax_kernels.cu")

+set_gencode_flags_for_srcs(
+SRCS "${VLLM_MOE_EXT_SRC}"
+CUDA_ARCHS "${CUDA_ARCHS}")
+
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-list(APPEND VLLM_MOE_EXT_SRC
-"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-"csrc/moe/marlin_moe_ops.cu")
+cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+if (MARLIN_MOE_ARCHS)
+set(MARLIN_MOE_SRC
+"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
+"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
+"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
+"csrc/moe/marlin_moe_ops.cu")
+
+set_gencode_flags_for_srcs(
+SRCS "${MARLIN_MOE_SRC}"
+CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+
+list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+else()
+message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
+"in CUDA target architectures")
+endif()
 endif()

 message(STATUS "Enabling moe extension.")

@@ -359,6 +489,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
 return()
 endif ()

+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
+# we need to manually set VLLM_GPU_ARCHES here.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+foreach(_ARCH ${CUDA_ARCHS})
+string(REPLACE "." "" _ARCH "${_ARCH}")
+list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
+endforeach()
+endif()
+
 #
 # Build vLLM flash attention from source
 #
@@ -1,30 +1,23 @@
|
|||||||
# Contributing to vLLM
|
# Contributing to vLLM
|
||||||
|
|
||||||
-Thank you for your interest in contributing to vLLM!
-Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
-There are several ways you can contribute to the project:
+Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:

 - Identify and report any issues or bugs.
-- Request or add a new model.
+- Request or add support for a new model.
 - Suggest or implement new features.
+- Improve documentation or contribute a how-to guide.

-However, remember that contributions aren't just about code.
-We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
+We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.

-Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
-Talk about it in your blog posts, highlighting how it's driving your incredible projects.
-Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!

-## Setup for development
+## Developing

-### Build from source
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.

-```bash
-pip install -e . # This may take several minutes.
-```

-### Testing
+## Testing

 ```bash
 pip install -r requirements-dev.txt
@@ -36,15 +29,16 @@ mypy
 # Unit tests
 pytest tests/
 ```
-**Note:** Currently, the repository does not pass the mypy tests.
+**Note:** Currently, the repository does not pass the ``mypy`` tests.

+## Contribution Guidelines

-## Contributing Guidelines
+### Issues

-### Issue Reporting
+If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.

-If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
-If not, please file a new issue, providing as much relevant information as possible.
+> [!IMPORTANT]
+> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).

 ### Pull Requests & Code Reviews

@@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE
 ### Thank You

 Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
-Your contributions make vLLM a great tool for everyone!
+All of your contributions help make vLLM a great tool and community for everyone!
Dockerfile (20)

@@ -27,6 +27,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
 && python3 --version && python3 -m pip --version

+# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
+# as it was causing spam when compiling the CUTLASS kernels
+RUN apt-get install -y gcc-10 g++-10
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
+RUN <<EOF
+gcc --version
+EOF

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image

@@ -63,14 +71,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 python3 -m pip install -r requirements-build.txt

 # files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
+COPY . .

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2

@@ -135,7 +136,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace

@@ -173,6 +174,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+COPY examples examples
 #################### vLLM installation IMAGE ####################
|
|||||||
@@ -22,11 +22,12 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
|
|||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||||
|
|
||||||
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
|
RUN pip install intel_extension_for_pytorch==2.4.0
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
|
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||||
|
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
|
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
|
||||||
pip install --upgrade pip && \
|
pip install --upgrade pip && \
|
||||||
|
@@ -9,16 +9,7 @@ RUN apt-get update -y && \
 ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace

-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-
-COPY vllm/ /workspace/vllm/vllm
-COPY csrc/core /workspace/vllm/csrc/core
-COPY cmake/utils.cmake /workspace/vllm/cmake/
-COPY CMakeLists.txt /workspace/vllm/
-COPY setup.py /workspace/vllm/
+COPY . .

 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
 echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \

@@ -7,20 +7,49 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
 chmod 644 /usr/share/keyrings/intel-graphics.gpg

 RUN apt-get update -y && \
-apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
+apt-get install -y --no-install-recommends --fix-missing \
+curl \
+ffmpeg \
+git \
+libsndfile1 \
+libsm6 \
+libxext6 \
+libgl1 \
+lsb-release \
+numactl \
+python3 \
+python3-dev \
+python3-pip \
+# vim \
+wget

+WORKDIR /workspace/vllm
+COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
+COPY requirements-common.txt /workspace/vllm/requirements-common.txt

+RUN --mount=type=cache,target=/root/.cache/pip \
+pip install --no-cache-dir \
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+-r requirements-xpu.txt

 COPY ./ /workspace/vllm

-WORKDIR /workspace/vllm
+ENV VLLM_TARGET_DEVICE=xpu

-RUN --mount=type=cache,target=/root/.cache/pip \
-pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
--r requirements-xpu.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
 --mount=type=bind,source=.git,target=.git \
-VLLM_TARGET_DEVICE=xpu python3 setup.py install
+python3 setup.py install

 CMD ["/bin/bash"]

+FROM vllm-base AS vllm-openai

+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+pip install accelerate hf_transfer 'modelscope!=1.15.0'

+ENV VLLM_USAGE_SOURCE production-docker-image \
+TRITON_XPU_PROFILE 1

+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
README.md (17)

@@ -10,22 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |

 </p>

----
-
-**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
-
-We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
-Join us to learn more about recent advancements of vLLM on MI300X.
-Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
-
----

 *Latest News* 🔥
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
 - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).

@@ -51,7 +42,7 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill

-**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.

 vLLM is flexible and easy to use with:
@@ -2,11 +2,10 @@

 ## Reporting a Vulnerability

-If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away.
-We will investigate all legitimate reports and do our best to quickly fix the problem.
+If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

-Please report security issues using https://github.com/vllm-project/vllm/security/advisories/new
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).

 ---
-Please see PyTorch Security for more information how to securely interact with models: https://github.com/pytorch/pytorch/blob/main/SECURITY.md
-This document mostly references the recommendation from PyTorch, thank you!
+Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@@ -23,9 +23,9 @@ class RequestFuncInput:
 output_len: int
 model: str
 best_of: int = 1
-use_beam_search: bool = False
 logprobs: Optional[int] = None
 multi_modal_content: Optional[dict] = None
+ignore_eos: bool = False


 @dataclass

@@ -48,13 +48,13 @@ async def async_request_tgi(
 assert api_url.endswith("generate_stream")

 async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-assert not request_func_input.use_beam_search
 params = {
 "best_of": request_func_input.best_of,
 "max_new_tokens": request_func_input.output_len,
 "do_sample": True,
 "temperature": 0.01, # TGI does not accept 0.0 temperature.
 "top_p": 0.99, # TGI does not accept 1.0 top_p.
+# TGI does not accept ignore_eos flag.
 }
 payload = {
 "inputs": request_func_input.prompt,

@@ -119,7 +119,6 @@ async def async_request_trt_llm(
 assert api_url.endswith("generate_stream")

 async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-assert not request_func_input.use_beam_search
 assert request_func_input.best_of == 1
 payload = {
 "accumulate_tokens": True,

@@ -129,6 +128,8 @@ async def async_request_trt_llm(
 "max_tokens": request_func_input.output_len,
 "stream": True,
 }
+if request_func_input.ignore_eos:
+payload["min_length"] = request_func_input.output_len
 output = RequestFuncOutput()
 output.prompt_len = request_func_input.prompt_len

@@ -183,7 +184,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
 async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
 assert request_func_input.best_of == 1
-assert not request_func_input.use_beam_search

 payload = {
 "prompt": request_func_input.prompt,

@@ -231,7 +231,6 @@ async def async_request_openai_completions(
 ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

 async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-assert not request_func_input.use_beam_search
 payload = {
 "model": request_func_input.model,
 "prompt": request_func_input.prompt,

@@ -240,6 +239,7 @@ async def async_request_openai_completions(
 "max_tokens": request_func_input.output_len,
 "logprobs": request_func_input.logprobs,
 "stream": True,
+"ignore_eos": request_func_input.ignore_eos,
 }
 headers = {
 "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"

@@ -312,7 +312,6 @@ async def async_request_openai_chat_completions(
 ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

 async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-assert not request_func_input.use_beam_search
 content = [{"type": "text", "text": request_func_input.prompt}]
 if request_func_input.multi_modal_content:
 content.append(request_func_input.multi_modal_content)

@@ -327,6 +326,7 @@ async def async_request_openai_chat_completions(
 "temperature": 0.0,
 "max_tokens": request_func_input.output_len,
 "stream": True,
+"ignore_eos": request_func_input.ignore_eos,
 }
 headers = {
 "Content-Type": "application/json",

@@ -430,4 +430,5 @@ ASYNC_REQUEST_FUNCS = {
 "openai-chat": async_request_openai_chat_completions,
 "tensorrt-llm": async_request_trt_llm,
 "scalellm": async_request_openai_completions,
+"sglang": async_request_openai_completions,
 }
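As an aside on the `ignore_eos` plumbing in the hunks above: a minimal, self-contained sketch of how the new field is intended to reach the request payloads. The helper names below (`build_openai_payload`, `build_trt_llm_payload`) are hypothetical stand-ins for the `async_request_*` functions, which build these payloads inline.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RequestFuncInput:            # trimmed stand-in for the dataclass in the diff
    prompt: str
    output_len: int
    model: str
    best_of: int = 1
    logprobs: Optional[int] = None
    ignore_eos: bool = False       # new field introduced by this change


def build_openai_payload(req: RequestFuncInput) -> dict:
    # Mirrors async_request_openai_completions above: the flag is passed straight through.
    return {
        "model": req.model,
        "prompt": req.prompt,
        "max_tokens": req.output_len,
        "logprobs": req.logprobs,
        "stream": True,
        "ignore_eos": req.ignore_eos,
    }


def build_trt_llm_payload(req: RequestFuncInput) -> dict:
    # TensorRT-LLM has no ignore_eos flag; the diff emulates it by pinning
    # min_length to the requested output length.
    payload = {"max_tokens": req.output_len, "stream": True}
    if req.ignore_eos:
        payload["min_length"] = req.output_len
    return payload


print(build_openai_payload(RequestFuncInput("Hello", 16, "my-model", ignore_eos=True)))
print(build_trt_llm_payload(RequestFuncInput("Hello", 16, "my-model", ignore_eos=True)))
```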
@@ -11,7 +11,7 @@ from tqdm import tqdm

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptInputs
+from vllm.inputs import PromptType
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser

@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):

 sampling_params = SamplingParams(
 n=args.n,
-temperature=0.0 if args.use_beam_search else 1.0,
+temperature=1.0,
 top_p=1.0,
-use_beam_search=args.use_beam_search,
 ignore_eos=True,
 max_tokens=args.output_len,
 )

@@ -61,7 +60,7 @@ def main(args: argparse.Namespace):
 dummy_prompt_token_ids = np.random.randint(10000,
 size=(args.batch_size,
 args.input_len))
-dummy_inputs: List[PromptInputs] = [{
+dummy_prompts: List[PromptType] = [{
 "prompt_token_ids": batch
 } for batch in dummy_prompt_token_ids.tolist()]

@@ -74,13 +73,13 @@ def main(args: argparse.Namespace):
 ],
 on_trace_ready=torch.profiler.tensorboard_trace_handler(
 str(profile_dir))) as p:
-llm.generate(dummy_inputs,
+llm.generate(dummy_prompts,
 sampling_params=sampling_params,
 use_tqdm=False)
 print(p.key_averages())
 else:
 start_time = time.perf_counter()
-llm.generate(dummy_inputs,
+llm.generate(dummy_prompts,
 sampling_params=sampling_params,
 use_tqdm=False)
 end_time = time.perf_counter()

@@ -222,7 +221,9 @@ if __name__ == '__main__':
 parser.add_argument("--enable-prefix-caching",
 action='store_true',
 help="Enable automatic prefix caching")
-parser.add_argument('--use-v2-block-manager', action='store_true')
+parser.add_argument('--use-v2-block-manager',
+action='store_true',
+default=EngineArgs.use_v2_block_manager)
 parser.add_argument(
 "--ray-workers-use-nsight",
 action='store_true',
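A side note on the `PromptInputs` to `PromptType` rename in this hunk: the dummy prompts themselves are plain dicts keyed by `prompt_token_ids`; only the annotation changes. A small standalone sketch of that shape (no vLLM install required; the `llm.generate` call is left as a comment because it assumes a working engine):

```python
from typing import Dict, List

import numpy as np

batch_size, input_len = 4, 8
dummy_prompt_token_ids = np.random.randint(10000, size=(batch_size, input_len))

# Each dummy prompt is a dict of token ids, the same structure the benchmark
# builds and annotates as List[PromptType] after the rename.
dummy_prompts: List[Dict[str, List[int]]] = [
    {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]

print(dummy_prompts[0])
# With vLLM available this list is passed straight to the engine, e.g.:
#   llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
```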
@@ -33,6 +33,7 @@ from typing import List, Optional, Tuple
 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser

 try:

@@ -113,7 +114,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
 def main(args):
 tokenizer = get_tokenizer(args.model, trust_remote_code=True)
 input_length_range = tuple(map(int, args.input_length_range.split(':')))
+random.seed(args.seed)
 if args.dataset_path is not None:
 print(f"Start to sample {args.num_prompts} prompts"
 "from {args.dataset_path}")

@@ -177,6 +178,7 @@ if __name__ == "__main__":
 help='enable prefix caching')
 parser.add_argument('--use-v2-block-manager',
 action='store_true',
+default=EngineArgs.use_v2_block_manager,
 help='Use BlockSpaceMangerV2')
 parser.add_argument('--num-prompts',
 type=int,

@@ -194,5 +196,9 @@ if __name__ == "__main__":
 default='128:256',
 help='Range of input lengths for sampling prompts,'
 'specified as "min:max" (e.g., "128:256").')
+parser.add_argument("--seed",
+type=int,
+default=0,
+help='Random seed for reproducibility')
 args = parser.parse_args()
 main(args)
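In practice the new `--seed` flag boils down to calling `random.seed(...)` before the dataset is sampled, which makes the chosen prompt subset repeatable across runs. A tiny illustration with a made-up `prompts` list:

```python
import random

prompts = [f"prompt-{i}" for i in range(100)]  # stand-in for dataset prompts


def sample_prompts(seed: int, k: int = 5) -> list:
    random.seed(seed)
    return random.sample(prompts, k)


# Same seed, same subset -- the property the 'Random seed for reproducibility'
# argument is after.
assert sample_prompts(0) == sample_prompts(0)
print(sample_prompts(0))
```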
@@ -68,7 +68,6 @@ def run_vllm(
 tensor_parallel_size: int,
 seed: int,
 n: int,
-use_beam_search: bool,
 trust_remote_code: bool,
 dtype: str,
 max_model_len: Optional[int],

@@ -114,9 +113,8 @@ def run_vllm(
 sampling_params.append(
 SamplingParams(
 n=n,
-temperature=0.0 if use_beam_search else 1.0,
+temperature=1.0,
 top_p=1.0,
-use_beam_search=use_beam_search,
 ignore_eos=True,
 max_tokens=output_len,
 ))

@@ -144,15 +142,16 @@ def main(args: argparse.Namespace):
 args.output_len)

 if args.backend == "vllm":
-elapsed_time = run_vllm(
-requests, args.model, args.tokenizer, args.quantization,
-args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-args.trust_remote_code, args.dtype, args.max_model_len,
+elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+args.quantization, args.tensor_parallel_size,
+args.seed, args.n, args.trust_remote_code,
+args.dtype, args.max_model_len,
 args.enforce_eager, args.kv_cache_dtype,
 args.quantization_param_path, args.device,
-args.enable_prefix_caching, args.enable_chunked_prefill,
-args.max_num_batched_tokens, args.gpu_memory_utilization,
-args.download_dir)
+args.enable_prefix_caching,
+args.enable_chunked_prefill,
+args.max_num_batched_tokens,
+args.gpu_memory_utilization, args.download_dir)
 else:
 raise ValueError(f"Unknown backend: {args.backend}")
 total_num_tokens = sum(prompt_len + output_len

@@ -203,7 +202,6 @@ if __name__ == "__main__":
 type=int,
 default=1,
 help="Number of generated sequences per prompt.")
-parser.add_argument("--use-beam-search", action="store_true")
 parser.add_argument("--num-prompts",
 type=int,
 default=200,
@@ -1,4 +1,4 @@
-"""Benchmark online serving throughput.
+r"""Benchmark online serving throughput.

 On the server side, run one of the following commands:
 vLLM OpenAI API server

@@ -89,10 +89,8 @@ def sample_sharegpt_requests(
 tokenizer: PreTrainedTokenizerBase,
 fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
-if fixed_output_len is not None and fixed_output_len < 4:
-raise ValueError("output_len too small")
 # Load the dataset.
-with open(dataset_path) as f:
+with open(dataset_path, encoding='utf-8') as f:
 dataset = json.load(f)
 # Filter out the conversations with less than 2 turns.
 dataset = [data for data in dataset if len(data["conversations"]) >= 2]

@@ -117,7 +115,7 @@ def sample_sharegpt_requests(
 prompt_len = len(prompt_token_ids)
 output_len = len(completion_token_ids
 ) if fixed_output_len is None else fixed_output_len
-if prompt_len < 4 or output_len < 4:
+if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
 # Prune too short sequences.
 continue
 if prompt_len > 1024 or prompt_len + output_len > 2048:

@@ -141,7 +139,7 @@ def sample_sonnet_requests(
 ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

 # Load the dataset.
-with open(dataset_path) as f:
+with open(dataset_path, encoding='utf-8') as f:
 poem_lines = f.readlines()

 # Tokenize the poem lines.

@@ -178,9 +176,9 @@ def sample_sonnet_requests(
 # Sample the rest of lines per request.
 sampled_requests: List[Tuple[str, int, int]] = []
 for _ in range(num_requests):
-sampled_lines = "".join(
-prefix_lines +
-random.sample(poem_lines, num_input_lines - num_prefix_lines))
+num_lines_needed = num_input_lines - num_prefix_lines
+sampled_lines = "".join(prefix_lines +
+random.choices(poem_lines, k=num_lines_needed))

 prompt = f"{base_prompt}{sampled_lines}"
 message = [

@@ -228,10 +226,11 @@ def sample_hf_requests(
 prompt_len = len(prompt_token_ids)
 output_len = len(completion_token_ids
 ) if fixed_output_len is None else fixed_output_len
-if prompt_len < 4 or output_len < 4:
+if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
 # Prune too short sequences.
 continue
-if prompt_len > 1024 or prompt_len + output_len > 2048:
+if fixed_output_len is None and \
+(prompt_len > 1024 or prompt_len + output_len > 2048):
 # Prune too long sequences.
 continue

@@ -392,12 +391,12 @@ async def benchmark(
 input_requests: List[Tuple[str, int, int]],
 logprobs: Optional[int],
 best_of: int,
-use_beam_search: bool,
 request_rate: float,
 disable_tqdm: bool,
 profile: bool,
 selected_percentile_metrics: List[str],
 selected_percentiles: List[str],
+ignore_eos: bool,
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]

@@ -419,8 +418,8 @@ async def benchmark(
 output_len=test_output_len,
 logprobs=logprobs,
 best_of=best_of,
-use_beam_search=use_beam_search,
 multi_modal_content=test_mm_content,
+ignore_eos=ignore_eos,
 )
 test_output = await request_func(request_func_input=test_input)
 if not test_output.success:

@@ -440,7 +439,6 @@ async def benchmark(
 output_len=test_output_len,
 logprobs=logprobs,
 best_of=best_of,
-use_beam_search=use_beam_search,
 multi_modal_content=test_mm_content,
 )
 profile_output = await request_func(request_func_input=profile_input)

@@ -463,7 +461,6 @@ async def benchmark(
 output_len=output_len,
 logprobs=logprobs,
 best_of=best_of,
-use_beam_search=use_beam_search,
 multi_modal_content=mm_content,
 )
 tasks.append(

@@ -482,7 +479,6 @@ async def benchmark(
 output_len=test_output_len,
 logprobs=logprobs,
 best_of=best_of,
-use_beam_search=use_beam_search,
 )
 profile_output = await request_func(request_func_input=profile_input)
 if profile_output.success:

@@ -540,7 +536,7 @@ async def benchmark(
 # E.g., "Time to First Token"
 metric_header: str,
 ):
-# This function print and add statistics of the specified
+# This function prints and adds statistics of the specified
 # metric.
 if metric_attribute_name not in selected_percentile_metrics:
 return

@@ -678,7 +674,6 @@ def main(args: argparse.Namespace):
 input_requests=input_requests,
 logprobs=args.logprobs,
 best_of=args.best_of,
-use_beam_search=args.use_beam_search,
 request_rate=args.request_rate,
 disable_tqdm=args.disable_tqdm,
 profile=args.profile,

@@ -686,6 +681,7 @@ def main(args: argparse.Namespace):
 selected_percentiles=[
 float(p) for p in args.metric_percentiles.split(",")
 ],
+ignore_eos=args.ignore_eos,
 ))

 # Save config and results to json

@@ -699,7 +695,6 @@ def main(args: argparse.Namespace):
 result_json["model_id"] = model_id
 result_json["tokenizer_id"] = tokenizer_id
 result_json["best_of"] = args.best_of
-result_json["use_beam_search"] = args.use_beam_search
 result_json["num_prompts"] = args.num_prompts

 # Metadata

@@ -727,7 +722,7 @@ def main(args: argparse.Namespace):
 file_name = args.result_filename
 if args.result_dir:
 file_name = os.path.join(args.result_dir, file_name)
-with open(file_name, "w") as outfile:
+with open(file_name, "w", encoding='utf-8') as outfile:
 json.dump(result_json, outfile)

@@ -864,6 +859,11 @@ if __name__ == "__main__":
 "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
 " format.",
 )
+parser.add_argument(
+"--ignore-eos",
+action="store_true",
+help="Set ignore_eos flag when sending the benchmark request."
+"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
 parser.add_argument(
 "--percentile-metrics",
 type=str,
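One detail worth spelling out from the `sample_sonnet_requests` hunk above: `random.sample` draws without replacement and raises `ValueError` as soon as more lines are requested than the poem file contains, whereas `random.choices` samples with replacement and accepts any `k`. A quick standalone illustration (the ten-line `poem_lines` list is made up for the example):

```python
import random

poem_lines = [f"line {i}\n" for i in range(10)]  # stand-in for the sonnet file
num_lines_needed = 25                            # more than len(poem_lines)

try:
    random.sample(poem_lines, num_lines_needed)  # old behaviour
except ValueError as exc:
    print("random.sample fails:", exc)

# New behaviour: sampling with replacement never runs out of lines.
sampled = random.choices(poem_lines, k=num_lines_needed)
print(len(sampled))  # 25
```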
@@ -15,6 +15,7 @@ from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
 build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators

@@ -72,7 +73,6 @@ def run_vllm(
 tensor_parallel_size: int,
 seed: int,
 n: int,
-use_beam_search: bool,
 trust_remote_code: bool,
 dtype: str,
 max_model_len: Optional[int],

@@ -90,7 +90,6 @@ def run_vllm(
 download_dir: Optional[str] = None,
 load_format: str = EngineArgs.load_format,
 disable_async_output_proc: bool = False,
-use_new_beam_search_impl: bool = False,
 ) -> float:
 from vllm import LLM, SamplingParams
 llm = LLM(

@@ -126,29 +125,32 @@ def run_vllm(
 sampling_params.append(
 SamplingParams(
 n=n,
-temperature=0.0 if use_beam_search else 1.0,
+temperature=1.0,
 top_p=1.0,
-use_beam_search=use_beam_search,
 ignore_eos=True,
 max_tokens=output_len,
 ))

-if not use_new_beam_search_impl:
+use_beam_search = False

+if not use_beam_search:
 start = time.perf_counter()
 llm.generate(prompts, sampling_params, use_tqdm=True)
 end = time.perf_counter()
 else:
-assert use_beam_search
 prompts = [prompt for prompt, _, _ in requests]
 # output_len should be the same for all requests.
 output_len = requests[0][2]
 for prompt, input_len, _output_len in requests:
 assert _output_len == output_len
 start = time.perf_counter()
-llm.beam_search(prompts,
-beam_width=n,
-max_tokens=output_len,
-ignore_eos=True)
+llm.beam_search(
+prompts,
+BeamSearchParams(
+beam_width=n,
+max_tokens=output_len,
+ignore_eos=True,
+))
 end = time.perf_counter()
 return end - start

@@ -161,7 +163,6 @@ async def run_vllm_async(
 tensor_parallel_size: int,
 seed: int,
 n: int,
-use_beam_search: bool,
 trust_remote_code: bool,
 dtype: str,
 max_model_len: Optional[int],

@@ -220,9 +221,8 @@ async def run_vllm_async(
 sampling_params.append(
 SamplingParams(
 n=n,
-temperature=0.0 if use_beam_search else 1.0,
+temperature=1.0,
 top_p=1.0,
-use_beam_search=use_beam_search,
 ignore_eos=True,
 max_tokens=output_len,
 ))

@@ -244,11 +244,9 @@ def run_hf(
 model: str,
 tokenizer: PreTrainedTokenizerBase,
 n: int,
-use_beam_search: bool,
 max_batch_size: int,
 trust_remote_code: bool,
 ) -> float:
-assert not use_beam_search
 llm = AutoModelForCausalLM.from_pretrained(
 model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
 if llm.config.model_type == "llama":

@@ -280,7 +278,7 @@ def run_hf(
 padding=True).input_ids
 llm_outputs = llm.generate(
 input_ids=input_ids.cuda(),
-do_sample=not use_beam_search,
+do_sample=True,
 num_return_sequences=n,
 temperature=1.0,
 top_p=1.0,

@@ -336,7 +334,7 @@ def main(args: argparse.Namespace):
 if args.backend == "vllm":
 run_args = [
 requests, args.model, args.tokenizer, args.quantization,
-args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+args.tensor_parallel_size, args.seed, args.n,
 args.trust_remote_code, args.dtype, args.max_model_len,
 args.enforce_eager, args.kv_cache_dtype,
 args.quantization_param_path, args.device,

@@ -351,12 +349,11 @@ def main(args: argparse.Namespace):
 run_args.append(args.disable_frontend_multiprocessing)
 elapsed_time = uvloop.run(run_vllm_async(*run_args))
 else:
-elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
+elapsed_time = run_vllm(*run_args)
 elif args.backend == "hf":
 assert args.tensor_parallel_size == 1
 elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-args.use_beam_search, args.hf_max_batch_size,
-args.trust_remote_code)
+args.hf_max_batch_size, args.trust_remote_code)
 elif args.backend == "mii":
 elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
 args.output_len)

@@ -410,8 +407,6 @@ if __name__ == "__main__":
 type=int,
 default=1,
 help="Number of generated sequences per prompt.")
-parser.add_argument("--use-beam-search", action="store_true")
-parser.add_argument("--use-new-beam-search-impl", action="store_true")
 parser.add_argument("--num-prompts",
 type=int,
 default=1000,

@@ -478,6 +473,7 @@ if __name__ == "__main__":
 help="Maximum number of forward steps per scheduler call.")
 parser.add_argument("--use-v2-block-manager",
 action='store_true',
+default=EngineArgs.use_v2_block_manager,
 help="Enable block manager v2.")
 parser.add_argument(
 "--enable-prefix-caching",

@@ -566,8 +562,6 @@ if __name__ == "__main__":
 raise ValueError("dtype must be auto for MII backend.")
 if args.n != 1:
 raise ValueError("n must be 1 for MII backend.")
-if args.use_beam_search:
-raise ValueError("Beam search is not supported for MII backend.")
 if args.quantization is not None:
 raise ValueError("Quantization is only for vLLM backend.")
 if args.hf_max_batch_size is not None:
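For readers following the beam-search refactor in this file: the keyword-style `llm.beam_search(prompts, beam_width=..., max_tokens=..., ignore_eos=True)` call is replaced by a `BeamSearchParams` object. A sketch of the new call shape, lifted from the hunk above; it assumes a working vLLM installation, and the small model name is hypothetical, chosen purely for illustration:

```python
# Sketch only -- running it requires a vLLM installation with a usable device.
from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")   # hypothetical small model for the example

prompts = ["The future of AI is"]
n, output_len = 4, 32

# Beam-search settings now travel together in one params object.
outputs = llm.beam_search(
    prompts,
    BeamSearchParams(
        beam_width=n,
        max_tokens=output_len,
        ignore_eos=True,
    ),
)
for out in outputs:
    print(out)
```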
@@ -84,7 +84,12 @@ endif()

 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

-list(APPEND LIBS dnnl numa)
+list(APPEND LIBS numa)

+# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
+if (AVX2_FOUND OR AVX512_FOUND)
+list(APPEND LIBS dnnl)
+endif()

 #
 # _C extension
|||||||
@@ -133,10 +133,181 @@ macro(string_to_ver OUT_VER IN_STR)
|
|||||||
string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
|
string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
|
||||||
endmacro()
|
endmacro()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
|
||||||
|
# `CUDA_ARCH_FLAGS`.
|
||||||
|
#
|
||||||
|
# Example:
|
||||||
|
# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
|
||||||
|
# clear_cuda_arches(CUDA_ARCH_FLAGS)
|
||||||
|
# CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
|
||||||
|
# CMAKE_CUDA_FLAGS="-Wall"
|
||||||
|
#
|
||||||
|
macro(clear_cuda_arches CUDA_ARCH_FLAGS)
|
||||||
|
# Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
|
||||||
|
string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
|
||||||
|
${CMAKE_CUDA_FLAGS})
|
||||||
|
|
||||||
|
# Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
|
||||||
|
# and passed back via the `CUDA_ARCHITECTURES` property.
|
||||||
|
string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
|
||||||
|
${CMAKE_CUDA_FLAGS})
|
||||||
|
endmacro()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Extract unique CUDA architectures from a list of compute capabilities codes in
|
||||||
|
# the form `<major><minor>[<letter>]`, convert them to the form sort
|
||||||
|
# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
|
||||||
|
# stores them in `OUT_ARCHES`.
|
||||||
|
#
|
||||||
|
# Example:
|
||||||
|
# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
|
||||||
|
# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
|
||||||
|
# OUT_ARCHES="7.5;...;9.0"
|
||||||
|
function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
|
||||||
|
set(_CUDA_ARCHES)
|
||||||
|
foreach(_ARCH ${CUDA_ARCH_FLAGS})
|
||||||
|
string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
|
||||||
|
if (_COMPUTE)
|
||||||
|
set(_COMPUTE ${CMAKE_MATCH_1})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
string_to_ver(_COMPUTE_VER ${_COMPUTE})
|
||||||
|
list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
|
||||||
|
endforeach()
|
||||||
|
|
||||||
|
list(REMOVE_DUPLICATES _CUDA_ARCHES)
|
||||||
|
list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
|
||||||
|
set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
|
||||||
|
endfunction()
|
||||||
|
|
||||||
|
#
|
||||||
|
# For a specific file set the `-gencode` flag in compile options conditionally
|
||||||
|
# for the CUDA language.
|
||||||
|
#
|
||||||
|
# Example:
|
||||||
|
# set_gencode_flag_for_srcs(
|
||||||
|
# SRCS "foo.cu"
|
||||||
|
# ARCH "compute_75"
|
||||||
|
# CODE "sm_75")
|
||||||
|
# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
|
||||||
|
# `foo.cu` (only for the CUDA language).
|
||||||
|
#
|
||||||
|
macro(set_gencode_flag_for_srcs)
|
||||||
|
set(options)
|
||||||
|
set(oneValueArgs ARCH CODE)
|
||||||
|
set(multiValueArgs SRCS)
|
||||||
|
cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
|
||||||
|
"${multiValueArgs}" ${ARGN} )
|
||||||
|
set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
|
||||||
|
set_property(
|
+    SOURCE ${arg_SRCS}
+    APPEND PROPERTY
+    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
+  )
+  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
+endmacro(set_gencode_flag_for_srcs)
+
+#
+# For a list of source files, set the `-gencode` flags in the file-specific
+# compile options (specifically for the CUDA language).
+#
+# arguments are:
+#   SRCS: list of source files
+#   CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
+#   BUILD_PTX_FOR_ARCH: if set, PTX code will also be built for architecture
+#    `BUILD_PTX_FOR_ARCH` when there is a CUDA_ARCH in CUDA_ARCHS that is at
+#    least as large as BUILD_PTX_FOR_ARCH.
+#
+macro(set_gencode_flags_for_srcs)
+  set(options)
+  set(oneValueArgs BUILD_PTX_FOR_ARCH)
+  set(multiValueArgs SRCS CUDA_ARCHS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  foreach(_ARCH ${arg_CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    set_gencode_flag_for_srcs(
+      SRCS ${arg_SRCS}
+      ARCH "compute_${_ARCH}"
+      CODE "sm_${_ARCH}")
+  endforeach()
+
+  if (${arg_BUILD_PTX_FOR_ARCH})
+    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
+    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
+      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_PTX_ARCH}"
+        CODE "compute_${_PTX_ARCH}")
+    endif()
+  endif()
+endmacro()
+
+#
+# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
+# `<major>.<minor>[letter]`, compute the "loose intersection" with the
+# `TGT_CUDA_ARCHS` list of gencodes.
+# The loose intersection is defined as:
+#   { max{ x \in src | x <= y } | y \in tgt, { x \in src | x <= y } != {} }
+# where `<=` is the version comparison operator.
+# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
+# in `SRC_CUDA_ARCHS` that is less than or equal to it.
+# There is special handling for 9.0a: if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
+# in `TGT_CUDA_ARCHS`, then 9.0a is removed from `SRC_CUDA_ARCHS` and added to
+# the result.
+# The result is stored in `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
+#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
+#
+function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+
+  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in TGT_CUDA_ARCHS then we should
+  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
+  set(_CUDA_ARCHS)
+  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+      set(_CUDA_ARCHS "9.0a")
+    endif()
+  endif()
+
+  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS
+  # that is less than or equal to ARCH
+  foreach(_ARCH ${TGT_CUDA_ARCHS})
+    set(_TMP_ARCH)
+    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        set(_TMP_ARCH ${_SRC_ARCH})
+      else()
+        break()
+      endif()
+    endforeach()
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
+    endif()
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
+endfunction()
+
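The selection rule above is compact but easy to misread in CMake, so here is a minimal Python sketch of the same "loose intersection" (the helper names `parse` and `loose_intersection` are illustrative only and are not part of the build system):

# Sketch of the "loose intersection": for each target architecture keep the
# highest source architecture that does not exceed it, with the 9.0a special
# case mirroring the CMake code above.
def parse(ver: str):
    # "8.6" -> (8, 6, ""), "9.0a" -> (9, 0, "a")
    suffix = ver[-1] if ver[-1].isalpha() else ""
    major, minor = ver.rstrip("abcdefghijklmnopqrstuvwxyz").split(".")
    return (int(major), int(minor), suffix)

def loose_intersection(src, tgt):
    result = []
    src = list(dict.fromkeys(src))      # de-duplicate, keep order
    if "9.0a" in src:                   # special handling for 9.0a
        src.remove("9.0a")
        if "9.0" in tgt:
            result.append("9.0a")
    src.sort(key=parse)
    for t in tgt:
        best = None
        for s in src:                   # src is sorted ascending
            if parse(s) <= parse(t):
                best = s
            else:
                break
        if best is not None and best not in result:
            result.append(best)
    return result

print(loose_intersection(["7.5", "8.0", "8.6", "9.0", "9.0a"],
                         ["8.0", "8.9", "9.0"]))
# ['9.0a', '8.0', '8.6', '9.0']

Running it on the example from the comment yields the same set as `OUT_CUDA_ARCHS`, only in a different order.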
#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`.
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
+# the architectures on a per file basis.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
#
@@ -174,109 +345,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()

-  elseif(${GPU_LANG} STREQUAL "CUDA")
-    #
-    # Setup/process CUDA arch flags.
-    #
-    # The torch cmake setup hardcodes the detected architecture flags in
-    # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
-    # can't modified on a per-target basis.
-    # So, all the `-gencode` flags need to be extracted and removed from
-    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
-    # Since it's not possible to use `target_compiler_options` for adding target
-    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
-    # must be used instead. This requires repackaging the architecture flags
-    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
-    #
-    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
-    # to one of the other nvcc options to specify architectures.
-    #
-    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
-    # detected architectures.
-    #
-    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
-
-    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
-    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
-    # and passed back via the `CUDA_ARCHITECTURES` property.
-    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
-    # If this error is triggered, it might mean that torch has changed how it sets
-    # up nvcc architecture code generation flags.
-    if (NOT _CUDA_ARCH_FLAGS)
-      message(FATAL_ERROR
-        "Could not find any architecture related code generation flags in "
-        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
-    endif()
-
-    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
-    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
-
-    # Initialize the architecture lists to empty.
-    set(${GPU_ARCHES})
-
-    # Process each `gencode` flag.
-    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
-      # For each flag, extract the version number and whether it refers to PTX
-      # or native code.
-      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
-      # for that match.
-
-      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
-      if (_COMPUTE)
-        set(_COMPUTE ${CMAKE_MATCH_1})
-      endif()
-
-      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
-      if (_SM)
-        set(_SM ${CMAKE_MATCH_1})
-      endif()
-
-      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
-      if (_CODE)
-        set(_CODE ${CMAKE_MATCH_1})
-      endif()
-
-      # Make sure the virtual architecture can be matched.
-      if (NOT _COMPUTE)
-        message(FATAL_ERROR
-          "Could not determine virtual architecture from: ${_ARCH}.")
-      endif()
-
-      # One of sm_ or compute_ must exist.
-      if ((NOT _SM) AND (NOT _CODE))
-        message(FATAL_ERROR
-          "Could not determine a codegen architecture from: ${_ARCH}.")
-      endif()
-
-      if (_SM)
-        # -real suffix let CMake to only generate elf code for the kernels.
-        # we want this, otherwise the added ptx (default) will increase binary size.
-        set(_VIRT "-real")
-        set(_CODE_ARCH ${_SM})
-      else()
-        # -virtual suffix let CMake to generate ptx code for the kernels.
-        set(_VIRT "-virtual")
-        set(_CODE_ARCH ${_CODE})
-      endif()
-
-      # Check if the current version is in the supported arch list.
-      string_to_ver(_CODE_VER ${_CODE_ARCH})
-      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
-        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
-        continue()
-      endif()
-
-      # Add it to the arch list.
-      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
-    endforeach()
  endif()
-  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()

#
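For reference, the job of the removed branch, turning torch's `-gencode` flags into `CUDA_ARCHITECTURES` entries (`<arch>-real` for SASS only, `<arch>-virtual` for PTX only), can be sketched in a few lines of Python; the regexes mirror the CMake ones above and the function name is illustrative:

import re

# Sketch only: translate "-gencode arch=compute_80,code=sm_80" style flags
# into CUDA_ARCHITECTURES entries. Not part of the build system.
def gencode_to_cuda_architectures(cuda_flags: str):
    arches = []
    for flag in re.findall(r"-gencode arch=[^ ]+", cuda_flags):
        compute = re.search(r"arch=compute_([0-9]+a?)", flag)
        sm = re.search(r"code=sm_([0-9]+a?)", flag)
        code = re.search(r"code=compute_([0-9]+a?)", flag)
        if compute is None or (sm is None and code is None):
            raise ValueError(f"could not parse gencode flag: {flag}")
        if sm is not None:
            arches.append(f"{sm.group(1)}-real")       # emit only SASS/elf code
        else:
            arches.append(f"{code.group(1)}-virtual")  # emit only PTX
    return arches

flags = "-O2 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=compute_90"
print(gencode_to_cuda_architectures(flags))  # ['80-real', '90-virtual']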
@@ -267,13 +267,16 @@ def get_neuron_sdk_version(run_lambda):


def get_vllm_version():
-    try:
-        import vllm
-        return vllm.__version__ + "@" + vllm.__commit__
-    except Exception:
-        # old version of vllm does not have __commit__
-        return 'N/A'
+    from vllm import __version__, __version_tuple__
+
+    if __version__ == "dev":
+        return "N/A (dev)"
+
+    if len(__version_tuple__) == 4:  # dev build
+        git_sha = __version_tuple__[-1][1:]  # type: ignore
+        return f"{__version__} (git sha: {git_sha}"
+
+    return __version__


def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
3 csrc/core/exception.hpp Normal file
@@ -0,0 +1,3 @@
+#pragma once
+
+#define VLLM_IMPLIES(p, q) (!(p) || (q))
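`VLLM_IMPLIES` encodes logical implication, which is false only when the premise holds and the conclusion does not; a tiny Python equivalent for reference (the name `implies` is illustrative):

# Python equivalent of VLLM_IMPLIES(p, q) == (!(p) || (q)): the check only
# fails when p is true but q is false.
def implies(p: bool, q: bool) -> bool:
    return (not p) or q

assert implies(False, False) and implies(False, True) and implies(True, True)
assert not implies(True, False)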
@@ -12,6 +12,11 @@
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)

+// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
+  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
+
// REGISTER_EXTENSION allows the shared library to be loaded and initialized
// via python's import statement.
#define REGISTER_EXTENSION(NAME) \
@@ -39,8 +39,6 @@
template<typename input_t, typename weight_t>
void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
-template <typename input_t, typename weight_t>
-void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);

template<typename input_t, typename weight_t>
void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream);
@@ -55,8 +53,11 @@ void set_conv_params_fwd(ConvParamsBase &params,
                         const at::Tensor x,
                         const at::Tensor weight,
                         const at::Tensor out,
-                        void* bias_ptr,
-                        bool silu_activation) {
+                        const c10::optional<at::Tensor>& bias,
+                        bool silu_activation,
+                        const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
+                        const c10::optional<at::Tensor>& cache_indices = std::nullopt,
+                        const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {

  // Reset the parameters
  memset(&params, 0, sizeof(params));
@@ -71,26 +72,31 @@ void set_conv_params_fwd(ConvParamsBase &params,
  // Set the pointers and strides.
  params.x_ptr = x.data_ptr();
  params.weight_ptr = weight.data_ptr();
-  params.bias_ptr = bias_ptr;
+  params.bias_ptr = bias.has_value() ? bias.value().data_ptr() : nullptr;
  params.out_ptr = out.data_ptr();
  // All stride are in elements, not bytes.
-  params.x_batch_stride = x.stride(0);
-  params.x_c_stride = x.stride(1);
-  params.x_l_stride = x.stride(-1);
+  params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
+  params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
+  params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
+  const bool varlen = params.query_start_loc_ptr != nullptr;
+  params.x_batch_stride = x.stride(varlen ? 1 : 0);
+  params.x_c_stride = x.stride(varlen ? 0 : 1);
+  params.x_l_stride = x.stride(varlen ? 1 : -1);
  params.weight_c_stride = weight.stride(0);
  params.weight_width_stride = weight.stride(1);
-  params.out_batch_stride = out.stride(0);
-  params.out_c_stride = out.stride(1);
-  params.out_l_stride = out.stride(-1);
+  params.out_batch_stride = out.stride(varlen ? 1 : 0);
+  params.out_c_stride = out.stride(varlen ? 0 : 1);
+  params.out_l_stride = out.stride(varlen ? 1 : -1);
}


at::Tensor
causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
                  const c10::optional<at::Tensor> &bias_,
-                  const c10::optional<at::Tensor> &seq_idx_,
-                  const c10::optional<at::Tensor> &initial_states_,
-                  const c10::optional<at::Tensor> &final_states_out_,
+                  const c10::optional<at::Tensor> &conv_states,
+                  const c10::optional<at::Tensor> &query_start_loc,
+                  const c10::optional<at::Tensor> &cache_indices,
+                  const c10::optional<at::Tensor> &has_initial_state,
                  bool silu_activation) {
  auto input_type = x.scalar_type();
  auto weight_type = weight.scalar_type();
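The `varlen` stride selection in `set_conv_params_fwd` above switches between two input layouts: a batched `(batch, dim, seqlen)` tensor and a packed `(dim, total_tokens)` tensor whose per-sequence boundaries come from `query_start_loc`. A small NumPy sketch of how such a packed batch is split (the helper `split_varlen` is illustrative, not part of the kernels):

import numpy as np

# Two layouts distinguished by `varlen`:
#  - batched: x has shape (batch, dim, seqlen)
#  - varlen:  x has shape (dim, total_tokens); query_start_loc holds the
#             cumulative start offset of each sequence (length batch + 1).
def split_varlen(x, query_start_loc):
    """Yield one (dim, seqlen_i) view per sequence of a packed varlen batch."""
    batch_size = len(query_start_loc) - 1
    for b in range(batch_size):
        start, end = query_start_loc[b], query_start_loc[b + 1]
        yield x[:, start:end]

dim = 4
query_start_loc = np.array([0, 3, 8, 10])        # three sequences: lengths 3, 5, 2
x_varlen = np.arange(dim * 10).reshape(dim, 10)  # (dim, total_tokens)
for seq in split_varlen(x_varlen, query_start_loc):
    print(seq.shape)   # (4, 3), (4, 5), (4, 2)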
@@ -100,23 +106,21 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
  TORCH_CHECK(x.is_cuda());
  TORCH_CHECK(weight.is_cuda());

+  const bool varlen = query_start_loc.has_value() ? true : false;
  const auto sizes = x.sizes();
-  const int batch_size = sizes[0];
-  const int dim = sizes[1];
-  const int seqlen = sizes[2];
+  const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
+  const int dim = varlen ? sizes[0] : sizes[1];
+  const int seqlen = varlen ? sizes[1] : sizes[2];
  const int width = weight.size(-1);
-  CHECK_SHAPE(x, batch_size, dim, seqlen);
+  if (varlen){
+    CHECK_SHAPE(x, dim, seqlen);
+  }
+  else {
+    CHECK_SHAPE(x, batch_size, dim, seqlen);
+  }
  CHECK_SHAPE(weight, dim, width);

-  TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1);
-  const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1;
-
-  if (is_channel_last) {
-    TORCH_CHECK(dim % 8 == 0, "causal_conv1d only supports channel dimension divisible by 8 for now");
-    TORCH_CHECK(x.stride(2) % 8 == 0 and x.stride(0) % 8 == 0, "causal_conv1d with channel last layout requires strides (x.stride(0) and x.stride(2)) to be multiples of 8");
-  }
-  TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");

  if (bias_.has_value()) {
    auto bias = bias_.value();
@@ -126,56 +130,50 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
    CHECK_SHAPE(bias, dim);
  }

-  if (seq_idx_.has_value()) {
-    TORCH_CHECK(is_channel_last, "seq_idx is only supported for channel last layout");
-    auto seq_idx = seq_idx_.value();
-    TORCH_CHECK(seq_idx.scalar_type() == torch::kInt32);
-    TORCH_CHECK(seq_idx.is_cuda());
-    TORCH_CHECK(seq_idx.is_contiguous());
-    CHECK_SHAPE(seq_idx, batch_size, seqlen);
+  if (has_initial_state.has_value()) {
+    auto has_initial_state_ = has_initial_state.value();
+    TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
+    TORCH_CHECK(has_initial_state_.is_cuda());
+    CHECK_SHAPE(has_initial_state_, batch_size);
+  }
+
+  if (query_start_loc.has_value()) {
+    auto query_start_loc_ = query_start_loc.value();
+    TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(query_start_loc_.is_cuda());
+  }
+
+  if (cache_indices.has_value()) {
+    auto cache_indices_ = cache_indices.value();
+    TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(cache_indices_.is_cuda());
+    CHECK_SHAPE(cache_indices_, batch_size);
  }

  at::Tensor out = torch::empty_like(x);

  ConvParamsBase params;
  set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
-                      bias_.has_value() ? bias_.value().data_ptr() : nullptr,
-                      silu_activation);
+                      bias_,
+                      silu_activation,
+                      query_start_loc,
+                      cache_indices,
+                      has_initial_state
+                      );

-  if (seq_idx_.has_value()) {
-    params.seq_idx_ptr = seq_idx_.value().data_ptr();
+  if (conv_states.has_value()) {
+    auto conv_states_ = conv_states.value();
+    TORCH_CHECK(conv_states_.scalar_type() == input_type);
+    TORCH_CHECK(conv_states_.is_cuda());
+    params.conv_states_ptr = conv_states_.data_ptr();
+    params.conv_states_batch_stride = conv_states_.stride(0);
+    params.conv_states_c_stride = conv_states_.stride(1);
+    params.conv_states_l_stride = conv_states_.stride(2);
  } else {
-    params.seq_idx_ptr = nullptr;
+    params.conv_states_ptr = nullptr;
  }
-
-  if (initial_states_.has_value()) {
-    TORCH_CHECK(is_channel_last, "initial_states is only supported for channel last layout");
-    auto initial_states = initial_states_.value();
-    TORCH_CHECK(initial_states.scalar_type() == input_type);
-    TORCH_CHECK(initial_states.is_cuda());
-    CHECK_SHAPE(initial_states, batch_size, dim, width - 1);
-    TORCH_CHECK(initial_states.stride(1) == 1);
-    params.initial_states_ptr = initial_states.data_ptr();
-    params.initial_states_batch_stride = initial_states.stride(0);
-    params.initial_states_c_stride = initial_states.stride(1);
-    params.initial_states_l_stride = initial_states.stride(2);
-  } else {
-    params.initial_states_ptr = nullptr;
-  }
-
-  if (final_states_out_.has_value()) {
-    TORCH_CHECK(is_channel_last, "final_states is only supported for channel last layout");
-    auto final_states = final_states_out_.value();
-    TORCH_CHECK(final_states.scalar_type() == input_type);
-    TORCH_CHECK(final_states.is_cuda());
-    CHECK_SHAPE(final_states, batch_size, dim, width - 1);
-    TORCH_CHECK(final_states.stride(1) == 1);
-    params.final_states_ptr = final_states.data_ptr();
-    params.final_states_batch_stride = final_states.stride(0);
-    params.final_states_c_stride = final_states.stride(1);
-    params.final_states_l_stride = final_states.stride(2);
-  } else {
-    params.final_states_ptr = nullptr;
-  }

  // Otherwise the kernel will be launched from cuda:0 device
|
  at::cuda::CUDAGuard device_guard{(char)x.get_device()};
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
-    if (!is_channel_last) {
-      causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
-    } else {
-      causal_conv1d_channellast_fwd_cuda<input_t, weight_t>(params, stream);
-    }
+    causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
  });
  return out;
}
@@ -199,6 +193,7 @@ causal_conv1d_update(const at::Tensor &x,
                     const at::Tensor &weight,
                     const c10::optional<at::Tensor> &bias_,
                     bool silu_activation,
+                     const c10::optional<at::Tensor> &cache_seqlens_,
                     const c10::optional<at::Tensor> &conv_state_indices_) {
  auto input_type = x.scalar_type();
  auto weight_type = weight.scalar_type();
@@ -214,9 +209,12 @@ causal_conv1d_update(const at::Tensor &x,
  const auto sizes = x.sizes();
  const int batch_size = sizes[0];
  const int dim = sizes[1];
+  const int seqlen = sizes[2];
  const int width = weight.size(-1);
+  const int conv_state_len = conv_state.size(2);
+  TORCH_CHECK(conv_state_len >= width - 1);

-  CHECK_SHAPE(x, batch_size, dim);
+  CHECK_SHAPE(x, batch_size, dim, seqlen);
  CHECK_SHAPE(weight, dim, width);

  TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
@@ -232,15 +230,27 @@ causal_conv1d_update(const at::Tensor &x,
  at::Tensor out = torch::empty_like(x);

  ConvParamsBase params;
-  set_conv_params_fwd(params, batch_size, dim, /*seqlen=*/1, width, x, weight, out,
-                      bias_.has_value() ? bias_.value().data_ptr() : nullptr,
+  set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
+                      bias_,
                      silu_activation);
  params.conv_state_ptr = conv_state.data_ptr();
+  params.conv_state_len = conv_state_len;
  // All stride are in elements, not bytes.
  params.conv_state_batch_stride = conv_state.stride(0);
  params.conv_state_c_stride = conv_state.stride(1);
  params.conv_state_l_stride = conv_state.stride(2);

+  if (cache_seqlens_.has_value()) {
+    auto cache_seqlens = cache_seqlens_.value();
+    TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
+    TORCH_CHECK(cache_seqlens.is_cuda());
+    TORCH_CHECK(cache_seqlens.stride(-1) == 1);
+    CHECK_SHAPE(cache_seqlens, batch_size);
+    params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
+  } else {
+    params.cache_seqlens = nullptr;
+  }

  if (conv_state_indices_.has_value()) {
    auto conv_state_indices = conv_state_indices_.value();
    TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32)
@@ -249,11 +259,11 @@ causal_conv1d_update(const at::Tensor &x,
    CHECK_SHAPE(conv_state_indices, batch_size);

    int conv_state_entries = conv_state.size(0);
-    CHECK_SHAPE(conv_state, conv_state_entries, dim, width);
+    CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);

    params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
  } else {
-    CHECK_SHAPE(conv_state, batch_size, dim, width);
+    CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
    params.conv_state_indices_ptr = nullptr;
  }

@@ -296,7 +306,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    constexpr int kWidth = Ktraits::kWidth;
    constexpr int kNThreads = Ktraits::kNThreads;
    constexpr int kNElts = Ktraits::kNElts;
-    static constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
+    constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
    using input_t = typename Ktraits::input_t;
    using vec_t = typename Ktraits::vec_t;
    using weight_t = typename Ktraits::weight_t;
@@ -309,20 +319,39 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
    vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);

+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
    const int tidx = threadIdx.x;
    const int batch_id = blockIdx.x;
    const int channel_id = blockIdx.y;
-    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
+    const int *query_start_loc = kVarlen ? reinterpret_cast<int *>(params.query_start_loc_ptr) : nullptr;
+    const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id;
+    const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen;
+
+    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + sequence_start_index * params.x_batch_stride
        + channel_id * params.x_c_stride;
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
-    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
        + channel_id * params.out_c_stride;
    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);

+    bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
+
+    int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+
+    input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
+        : reinterpret_cast<input_t *>(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;

    // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
    if (tidx == 0) {
-        input_t zeros[kNElts] = {0};
-        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(zeros)[0];
+        input_t initial_state[kNElts] = {0};
+        if (has_initial_state) {
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; }
+        }
+        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(initial_state)[0];
    }

    float weight_vals[kWidth];
@@ -330,14 +359,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }

    constexpr int kChunkSize = kNThreads * kNElts;
-    const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize;
+    const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;
    for (int chunk = 0; chunk < n_chunks; ++chunk) {
        input_t x_vals_load[2 * kNElts] = {0};
        if constexpr(kIsVecLoad) {
-            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts);
+            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts);
        } else {
            __syncthreads();
-            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize);
+            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize);
        }
        x += kChunkSize;
        __syncthreads();
@@ -375,19 +404,57 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
        #pragma unroll
        for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
        if constexpr(kIsVecLoad) {
-            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts);
+            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts);
        } else {
-            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, params.seqlen - chunk * kChunkSize);
+            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
        }
        out += kChunkSize;
    }
+    // Final state is stored in the smem_exchange last token slot,
+    // in case seqlen < kWidth, we would need to take the final state from the
+    // initial state which is stored in conv_states
+    // in case seqlen > kWidth, we would need to load the last kWidth - 1 data
+    // and load it into conv_state accordingly
+    int last_thread = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts;
+    if (conv_states != nullptr && tidx == last_thread) {
+        input_t x_vals_load[kNElts * 2] = {0};
+        // in case we are on the first kWidth tokens
+        if (last_thread == 0 && seqlen < kWidth){
+            // Need to take the initial state
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[0];
+            const int offset = seqlen - (kWidth - 1);
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                // pad the existing state
+                if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; }
+                else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); }
+            }
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                if (offset + w >= 0)
+                    conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+        else {
+            // in case the final state is in between the threads data
+            reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
+            const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+    }
}
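The final-state code added to the kernel keeps the last `kWidth - 1` inputs of each sequence in `conv_states`, padding from the previous state (or zeros) when the sequence is shorter than the window. A NumPy reference of that bookkeeping for a single channel, assuming the same semantics (the helper name is illustrative):

import numpy as np

# Reference for the conv_states write-back: keep the last (width - 1) inputs of
# the sequence; if the sequence is shorter than that, shift in the old state
# (or zeros when there was no initial state) to fill the remaining slots.
def updated_conv_state(x_seq, prev_state, width):
    """x_seq: (seqlen,) inputs of one channel; prev_state: (width - 1,) or None."""
    state = np.zeros(width - 1, dtype=x_seq.dtype)
    if prev_state is not None:
        state[:] = prev_state
    history = np.concatenate([state, x_seq])
    return history[-(width - 1):]          # last (width - 1) observed values

x = np.array([1.0, 2.0])                   # seqlen (2) < width - 1 (3)
prev = np.array([7.0, 8.0, 9.0])
print(updated_conv_state(x, prev, width=4))   # [9. 1. 2.]
print(updated_conv_state(x, None, width=4))   # [0. 1. 2.]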

template<int kNThreads, int kWidth, typename input_t, typename weight_t>
void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
    static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
-    BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] {
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
+    BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] {
        using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
        constexpr int kSmemSize = Ktraits::kSmemSize;
        dim3 grid(params.batch, params.dim);
@@ -422,220 +489,11 @@ void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
    }
}

-template<int kNThreads_, int kWidth_, int kChunkSizeL_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
-struct Causal_conv1d_channellast_fwd_kernel_traits {
-    // The cache line is 128 bytes, and we try to read 16 bytes per thread.
-    // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension.
-    // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128
-    // threads). Each each load is 16 x 32|64 elements in the L x C dimensions.
-    using input_t = input_t_;
-    using weight_t = weight_t_;
-    static constexpr int kNThreads = kNThreads_;
-    static_assert(kNThreads % 32 == 0);
-    static constexpr int kNWarps = kNThreads / 32;
-    static constexpr int kWidth = kWidth_;
-    static constexpr int kChunkSizeL = kChunkSizeL_;
-    static constexpr int kNBytes = sizeof(input_t);
-    static_assert(kNBytes == 2 || kNBytes == 4);
-    static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
-    static constexpr int kNEltsPerRow = 128 / kNBytes;
-    static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts;  // Always 8 for now
-    static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
-    static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow;  // Always 4 for now
-    static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
-    static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
-    static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
-    static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
-    static constexpr bool kIsVecLoad = kIsVecLoad_;
-    using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
-    // using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
-    // using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
-    // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage),
-    //                                            sizeof(typename BlockStoreT::TempStorage)});
-    // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
-};
-
-template<typename Ktraits, bool kHasSeqIdx>
-__global__ __launch_bounds__(Ktraits::kNThreads)
-void causal_conv1d_channellast_fwd_kernel(ConvParamsBase params) {
-    constexpr int kWidth = Ktraits::kWidth;
-    constexpr int kNThreads = Ktraits::kNThreads;
-    constexpr int kNElts = Ktraits::kNElts;
-    constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
-    constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
-    constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
-    constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
-    using input_t = typename Ktraits::input_t;
-    using vec_t = typename Ktraits::vec_t;
-    using weight_t = typename Ktraits::weight_t;
-
-    // Shared memory.
-    __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts];
-
-    const int batch_id = blockIdx.x;
-    const int chunk_l_id = blockIdx.y;
-    const int chunk_c_id = blockIdx.z;
-    const int tid = threadIdx.x;
-    const int l_idx = tid / kNThreadsPerC;
-    const int c_idx = tid % kNThreadsPerC;
-    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
-        + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr)
-        + chunk_c_id * kChunkSizeC * params.weight_c_stride;
-    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
-        + (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-    int *seq_idx = !kHasSeqIdx ? nullptr : reinterpret_cast<int *>(params.seq_idx_ptr)
-        + batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
-    input_t *initial_states = params.initial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
-        : reinterpret_cast<input_t *>(params.initial_states_ptr) + batch_id * params.initial_states_batch_stride + l_idx * params.initial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-    // The last L-chunk will also have enough info to write to final states, since it also contain a few x values
-    // from the previous L-chunk.
-    input_t *final_states = params.final_states_ptr == nullptr || chunk_l_id < gridDim.y - 1 ? nullptr
-        : reinterpret_cast<input_t *>(params.final_states_ptr) + batch_id * params.final_states_batch_stride + l_idx * params.final_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-
-    #pragma unroll
-    for (int l = 0; l < Ktraits::kNLoads; ++l) {
-        input_t x_vals_load[kNElts] = {0};
-        if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
-            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + l * kLPerLoad * params.x_l_stride);
-        }
-        reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
-    }
-    // Load the elements from the previous chunk that are needed for convolution.
-    if (l_idx < kWidth - 1) {
-        input_t x_vals_load[kNElts] = {0};
-        if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
-            && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
-            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
-        } else if (initial_states != nullptr
-                   && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0
-                   && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(initial_states);
-        }
-        reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
-    }
-
-    __syncthreads();
-
-    if (final_states != nullptr
-        && l_idx < kWidth - 1
-        && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-        // x_smem[0] contains element at index chunk_l_id * kChunkSizeL - (kWidth - 1)
-        // So last few elements (index params.seqlen - kWidth + 1 + l_idx) are stored in x_smem[params.seqlen - kWidth + 1 + l_idx - (chunk_l_id * kChunkSizeL - kWidth + 1)][c_idx]
-        *reinterpret_cast<vec_t *>(final_states) = reinterpret_cast<vec_t *>(x_smem[params.seqlen + l_idx - chunk_l_id * kChunkSizeL])[c_idx];
-    }
-
-    constexpr int kLPerThread = constexpr_min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
-    static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
-    constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
-    static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
-    // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
-    static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
-    static_assert((kLPerThread & (kLPerThread - 1)) == 0);
-    static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
-    static_assert(kNThreadsPerRow <= 32);
-
-    const int row_idx = tid / kNThreadsPerRow;
-    const int col_idx = tid % kNThreadsPerRow;
-
-    float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
-    float weight_vals[kWidth] = {0};
-    if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
-        #pragma unroll
-        for (int w = 0; w < kWidth; ++w) {
-            weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
-        }
-    }
-    float x_vals[kWidth - 1 + kLPerThread];
-    #pragma unroll
-    for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
-        x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
-    }
-    int seq_idx_thread[kWidth - 1 + kLPerThread];
-    if constexpr (kHasSeqIdx) {
-        #pragma unroll
-        for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
-            seq_idx_thread[i] = chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1) >= 0 ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)] : -1;
-        }
-    }
-
-    float out_vals[kLPerThread];
-    #pragma unroll
-    for (int i = 0; i < kLPerThread; ++i) {
-        out_vals[i] = bias_val;
-        const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
-        #pragma unroll
-        for (int w = 0; w < kWidth; ++w) {
-            if constexpr (!kHasSeqIdx) {
-                out_vals[i] += weight_vals[w] * x_vals[i + w];
-            } else {
-                out_vals[i] += seq_idx_thread[i + w] == seq_idx_cur ? weight_vals[w] * x_vals[i + w] : 0.f;
-            }
-        }
-        if (params.silu_activation) {out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); }
-    }
-
-    __syncthreads();
-    #pragma unroll
-    for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i]; }
-    __syncthreads();
-
-    #pragma unroll
-    for (int l = 0; l < Ktraits::kNLoads; ++l) {
-        input_t out_vals_store[kNElts];
-        reinterpret_cast<vec_t *>(out_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[l * kLPerLoad + l_idx])[c_idx];
-        if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
-            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            *reinterpret_cast<vec_t *>(out + l * kLPerLoad * params.out_l_stride) = reinterpret_cast<vec_t *>(out_vals_store)[0];
-        }
-    }
-
-}
-
-template<int kNThreads, int kWidth, typename input_t, typename weight_t>
-void causal_conv1d_channellast_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
-    BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
-        using Ktraits = Causal_conv1d_channellast_fwd_kernel_traits<kNThreads, kWidth, 64, true, input_t, weight_t>;
-        // constexpr int kSmemSize = Ktraits::kSmemSize;
-        constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
-        constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
-        const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
-        const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
-        dim3 grid(params.batch, n_chunks_L, n_chunks_C);
-        dim3 block(Ktraits::kNThreads);
-        auto kernel = &causal_conv1d_channellast_fwd_kernel<Ktraits, kHasSeqIdx>;
-        // if (kSmemSize >= 48 * 1024) {
-        //     C10_CUDA_CHECK(cudaFuncSetAttribute(
-        //         kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
-        // }
-        // kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
-        kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-    });
-}
-
-template<typename input_t, typename weight_t>
-void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
-    if (params.width == 2) {
-        causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params, stream);
-    } else if (params.width == 3) {
-        causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params, stream);
-    } else if (params.width == 4) {
-        causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params, stream);
-    }
-}
-
template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
-
-template void causal_conv1d_channellast_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
-template void causal_conv1d_channellast_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
-template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
-///////
@@ -649,7 +507,7 @@ struct Causal_conv1d_update_kernel_traits {
    static_assert(kNBytes == 2 || kNBytes == 4);
};

-template<typename Ktraits>
+template<typename Ktraits, bool kIsCircularBuffer>
__global__ __launch_bounds__(Ktraits::kNThreads)
void causal_conv1d_update_kernel(ConvParamsBase params) {
    constexpr int kWidth = Ktraits::kWidth;
@@ -660,6 +518,8 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
    const int tidx = threadIdx.x;
    const int batch_id = blockIdx.x;
    const int channel_id = blockIdx.y * kNThreads + tidx;
+    if (channel_id >= params.dim) return;
+
    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
        + channel_id * params.x_c_stride;

@@ -675,35 +535,70 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
        + channel_id * params.out_c_stride;
-    float bias_val = params.bias_ptr == nullptr || channel_id >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+
+    int state_len = params.conv_state_len;
+    int advance_len = params.seqlen;
+    int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
+    int update_idx = cache_seqlen - (kWidth - 1);
+    update_idx = update_idx < 0 ? update_idx + state_len : update_idx;

    float weight_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
-        #pragma unroll
-        for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
-    }
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }

    float x_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
-        #pragma unroll
-        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = float(conv_state[(i + 1) * params.conv_state_l_stride]); }
-        x_vals[kWidth - 1] = float(x[0]);
-        #pragma unroll
-        for (int i = 0; i < kWidth; ++i) { conv_state[i * params.conv_state_l_stride] = input_t(x_vals[i]); }
+    if constexpr (!kIsCircularBuffer) {
+        #pragma unroll 2
+        for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
+            conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
+        }
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) {
+            input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
+            if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
+                conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
+            }
+            x_vals[i] = float(state_val);
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
+            input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
+            x_vals[i] = float(state_val);
+        }
+    }
+    #pragma unroll 2
+    for (int i = 0; i < params.seqlen; ++i) {
+        input_t x_val = x[i * params.x_l_stride];
+        if constexpr (!kIsCircularBuffer) {
+            if (i < advance_len && state_len - advance_len + i >= 0) {
+                conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
+            }
+        } else {
+            conv_state[update_idx * params.conv_state_l_stride] = x_val;
+            ++update_idx;
+            update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
+        }
+        x_vals[kWidth - 1] = float(x_val);
+        float out_val = bias_val;
+        #pragma unroll
+        for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
+        if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
+        out[i * params.out_l_stride] = input_t(out_val);
+        // Shift the input buffer by 1
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
    }

-    float out_val = bias_val;
-    #pragma unroll
-    for (int i = 0; i < kWidth; ++i) { out_val += weight_vals[i] * x_vals[i]; }
-    if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
-    if (channel_id < params.dim) { out[0] = input_t(out_val); }
}

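In the circular-buffer path above (`kIsCircularBuffer`, selected when `cache_seqlens` is passed), `conv_state` acts as a ring of length `conv_state_len`: the previous `kWidth - 1` values are read just before the write position and new tokens are written at `cache_seqlens[b]` onward, modulo the ring size. A compact Python model of one channel, where the helper and the unweighted sum are illustrative stand-ins for the real weighted convolution:

# Model of the circular conv_state handling for one channel.
# state: list of length state_len; cache_seqlen: tokens already written.
def update_circular(state, cache_seqlen, xs, width):
    state_len = len(state)
    # The (width - 1) values preceding the current write position.
    past = [state[(cache_seqlen - (width - 1) + i) % state_len] for i in range(width - 1)]
    outputs = []
    pos = cache_seqlen % state_len
    for x in xs:
        state[pos] = x                    # write the new token into the ring
        pos = (pos + 1) % state_len
        taps = past + [x]                 # width taps: history plus current token
        outputs.append(sum(taps))         # stand-in for sum(w[j] * taps[j]) + bias
        past = past[1:] + [x]             # slide the history window
    return outputs

state = [0.0] * 6
print(update_circular(state, cache_seqlen=0, xs=[1.0, 2.0, 3.0], width=4))
# [1.0, 3.0, 6.0]; state is now [1.0, 2.0, 3.0, 0.0, 0.0, 0.0]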
template<int kNThreads, int kWidth, typename input_t, typename weight_t>
void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
    using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
    dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
-    auto kernel = &causal_conv1d_update_kernel<Ktraits>;
+    auto kernel = params.cache_seqlens == nullptr
+        ? &causal_conv1d_update_kernel<Ktraits, false>
+        : &causal_conv1d_update_kernel<Ktraits, true>;
    kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
}
@@ -24,6 +24,7 @@ struct ConvParamsBase {
    index_t out_c_stride;
    index_t out_l_stride;

+    int conv_state_len;
    index_t conv_state_batch_stride;
    index_t conv_state_c_stride;
    index_t conv_state_l_stride;
@@ -35,6 +36,10 @@ struct ConvParamsBase {
    void *__restrict__ out_ptr;

    void *__restrict__ conv_state_ptr;
+    void *__restrict__ query_start_loc_ptr;
+    void *__restrict__ has_initial_state_ptr;
+    void *__restrict__ cache_indices_ptr;
+    int32_t *__restrict__ cache_seqlens;

    // For the continuous batching case. Makes it so that the mamba state for
    // the current batch doesn't need to be a contiguous tensor.
@@ -52,6 +57,11 @@ struct ConvParamsBase {
    index_t final_states_batch_stride;
    index_t final_states_l_stride;
    index_t final_states_c_stride;

+    void * conv_states_ptr;
+    index_t conv_states_batch_stride;
+    index_t conv_states_l_stride;
+    index_t conv_states_c_stride;
};

@@ -54,10 +54,14 @@ struct SSMParamsBase {
|
|||||||
void *__restrict__ delta_ptr;
|
void *__restrict__ delta_ptr;
|
||||||
void *__restrict__ delta_bias_ptr;
|
void *__restrict__ delta_bias_ptr;
|
||||||
void *__restrict__ out_ptr;
|
void *__restrict__ out_ptr;
|
||||||
void *__restrict__ x_ptr;
|
void *__restrict__ ssm_states_ptr;
|
||||||
void *__restrict__ z_ptr;
|
void *__restrict__ z_ptr;
|
||||||
void *__restrict__ out_z_ptr;
|
void *__restrict__ out_z_ptr;
|
||||||
void *__restrict__ index_ptr;
|
|
||||||
|
void *__restrict__ query_start_loc_ptr;
|
||||||
|
void *__restrict__ cache_indices_ptr;
|
||||||
|
void *__restrict__ has_initial_state_ptr;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
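query_start_loc_ptr carries the varlen batch layout as cumulative token offsets: entry b is where sequence b starts along the packed token axis, entry b+1 minus entry b is its length, so B sequences need B+1 entries. A small host-side sketch of that convention, with made-up lengths:

    #include <cstdio>
    #include <vector>

    int main() {
        // Three packed sequences of lengths 4, 1 and 3 -> cumulative starts.
        std::vector<int> query_start_loc = {0, 4, 5, 8};
        int batch_size = static_cast<int>(query_start_loc.size()) - 1;

        for (int b = 0; b < batch_size; ++b) {
            int sequence_start_index = query_start_loc[b];
            int seqlen = query_start_loc[b + 1] - sequence_start_index;
            std::printf("seq %d: start=%d len=%d\n", b, sequence_start_index, seqlen);
        }
    }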
@@ -201,7 +205,7 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
                                   typename Ktraits::input_t (&u_vals)[Ktraits::kNItems],
                                   typename Ktraits::BlockLoadT::TempStorage &smem_load,
                                   int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_load);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockLoadVecT(smem_load_vec).Load(
@@ -217,21 +221,6 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
     }
 }

-template<typename Ktraits>
-inline __device__ void load_index(int *u,
-                                  int (&u_vals)[Ktraits::kNItems],
-                                  typename Ktraits::BlockLoadIndexT::TempStorage &smem_load_index,
-                                  int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
-        auto& smem_load_index_vec = reinterpret_cast<typename Ktraits::BlockLoadIndexVecT::TempStorage&>(smem_load_index);
-        Ktraits::BlockLoadIndexVecT(smem_load_index_vec).Load(
-            reinterpret_cast<uint4*>(u),
-            reinterpret_cast<uint4(&)[Ktraits::kNLoadsIndex]>(u_vals)
-        );
-    } else {
-        Ktraits::BlockLoadIndexT(smem_load_index).Load(u, u_vals, seqlen, 0);
-    }
-}

 template<typename Ktraits>
 inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
@@ -240,7 +229,7 @@ inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
                                    int seqlen) {
     constexpr int kNItems = Ktraits::kNItems;
     typename Ktraits::input_t B_vals_load[kNItems];
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
@@ -263,7 +252,7 @@ inline __device__ void store_output(typename Ktraits::input_t *out,
     typename Ktraits::input_t write_vals[Ktraits::kNItems];
     #pragma unroll
     for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; }
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_store);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockStoreVecT(smem_store_vec).Store(
@@ -23,7 +23,7 @@

 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
          bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kUseIndex_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
 struct Selective_Scan_fwd_kernel_traits {
     static_assert(kNItems_ % 4 == 0);
     using input_t = input_t_;
@@ -38,22 +38,19 @@ struct Selective_Scan_fwd_kernel_traits {
     static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems);
     static_assert(kNItems % kNElts == 0);
     static constexpr int kNLoads = kNItems / kNElts;
-    static constexpr bool kIsEvenLen = kIsEvenLen_;
+    static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_;
     static constexpr bool kIsVariableB = kIsVariableB_;
     static constexpr bool kIsVariableC = kIsVariableC_;
     static constexpr bool kHasZ = kHasZ_;
-    static constexpr bool kUseIndex = kUseIndex_;
+    static constexpr bool kVarlen = kVarlen_;

-    static constexpr bool kDirectIO = kIsEvenLen && kNLoads == 1;
+    static constexpr bool kDirectIO = kVarlen_ ? false : kIsEvenLen && kNLoads == 1;
     static constexpr int kNLoadsIndex = kNItems / 4;
     using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
     using scan_t = float2;
     using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads,
                                          !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
-    using BlockLoadIndexT = cub::BlockLoad<int, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
-    using BlockLoadIndexVecT = cub::BlockLoad<uint4, kNThreads, kNLoadsIndex,
-                                              !(kIsEvenLen && kNLoadsIndex == 1) ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
     using BlockLoadWeightT = cub::BlockLoad<input_t, kNThreads, kNItems , cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockLoadWeightVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads ,
                                                !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
@@ -65,8 +62,6 @@ struct Selective_Scan_fwd_kernel_traits {
     using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_WARP_SCANS>;
     static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage),
                                                    sizeof(typename BlockLoadVecT::TempStorage),
-                                                   sizeof(typename BlockLoadIndexT::TempStorage),
-                                                   sizeof(typename BlockLoadIndexVecT::TempStorage),
                                                    (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage),
                                                    (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage),
                                                    sizeof(typename BlockStoreT::TempStorage),
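Note that when kVarlen_ is set the traits force kIsEvenLen (and therefore kDirectIO) to false, so variable-length batches always take the bounds-checked, non-vectorized load/store path regardless of the nominal sequence length. A reduced compile-time sketch of that gating, assuming toy template parameters:

    #include <cstdio>

    template <int kNThreads_, int kNItems_, bool kIsEvenLen_, bool kVarlen_>
    struct ScanTraitsSketch {
        static constexpr int  kNLoads    = kNItems_ / 4;
        // Varlen always disables the "even length" fast path...
        static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_;
        // ...and therefore also direct (vectorized) global IO.
        static constexpr bool kDirectIO  = kVarlen_ ? false : (kIsEvenLen && kNLoads == 1);
    };

    int main() {
        using Batched = ScanTraitsSketch<128, 4, true, false>;
        using Varlen  = ScanTraitsSketch<128, 4, true, true>;
        std::printf("batched: even=%d direct=%d\n", Batched::kIsEvenLen, Batched::kDirectIO);
        std::printf("varlen : even=%d direct=%d\n", Varlen::kIsEvenLen,  Varlen::kDirectIO);
    }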
@@ -80,7 +75,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     constexpr bool kIsVariableB = Ktraits::kIsVariableB;
     constexpr bool kIsVariableC = Ktraits::kIsVariableC;
     constexpr bool kHasZ = Ktraits::kHasZ;
-    constexpr bool kUseIndex = Ktraits::kUseIndex;
+    constexpr bool kVarlen = Ktraits::kVarlen;
     constexpr int kNThreads = Ktraits::kNThreads;
     constexpr int kNItems = Ktraits::kNItems;
     constexpr int kNRows = Ktraits::kNRows;
@@ -97,7 +92,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_loadstorescan);
     auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
     auto& smem_load_weight = reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage&>(smem_);
-    auto& smem_load_index = reinterpret_cast<typename Ktraits::BlockLoadIndexT::TempStorage&>(smem_);
     auto& smem_load_weight1 = *reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage*>(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage));
     auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
     auto& smem_scan = *reinterpret_cast<typename Ktraits::BlockScanT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
@@ -108,17 +102,29 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     const int batch_id = blockIdx.x;
     const int dim_id = blockIdx.y;
     const int group_id = dim_id / (params.dim_ngroups_ratio);
-    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + batch_id * params.u_batch_stride
+    int seqlen = params.seqlen;
+    int sequence_start_index = batch_id;
+    if constexpr (kVarlen){
+        int *query_start_loc = reinterpret_cast<int *>(params.query_start_loc_ptr);
+        sequence_start_index = query_start_loc[batch_id];
+        seqlen = query_start_loc[batch_id + 1] - sequence_start_index;
+    }
+    const bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];

+    const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+
+    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + sequence_start_index * params.u_batch_stride
         + dim_id * kNRows * params.u_d_stride;
-    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + batch_id * params.delta_batch_stride
+    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + sequence_start_index * params.delta_batch_stride
         + dim_id * kNRows * params.delta_d_stride;
     weight_t *A = reinterpret_cast<weight_t *>(params.A_ptr) + dim_id * kNRows * params.A_d_stride;
     weight_t *B = reinterpret_cast<weight_t *>(params.B_ptr) + dim_id * kNRows * params.B_d_stride;
-    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride;
+    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
-    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
-    scan_t *x = reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate;
-    int *index = !kUseIndex ? nullptr :reinterpret_cast<int *>(params.index_ptr) + batch_id * params.seqlen;
+    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate;

     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
@@ -142,9 +148,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // }

     constexpr int kChunkSize = kNThreads * kNItems;
-    for (int chunk = 0; chunk < params.n_chunks; ++chunk) {
+    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+    for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
-        int index_vals_load[kNRows][kNItems];

         __syncthreads();
         #pragma unroll
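cache_indices_ptr adds one level of indirection between a sequence's position in the current batch and the slot of ssm_states that holds its state, which is what allows the state cache to stay in place while batches are reshuffled between steps. A host-side sketch of that lookup, with illustrative sizes and slot numbers:

    #include <cstdio>
    #include <vector>

    int main() {
        const int dim = 2, dstate = 3;                          // tiny toy sizes
        std::vector<float> ssm_states(4 * dim * dstate, 0.0f);  // 4 cache slots

        // Batch of 2 sequences whose states live in slots 3 and 1.
        std::vector<int> cache_indices = {3, 1};

        for (int batch_id = 0; batch_id < 2; ++batch_id) {
            // With no cache_indices the slot would simply be batch_id.
            int cache_index = cache_indices.empty() ? batch_id : cache_indices[batch_id];
            float* state = ssm_states.data() + cache_index * dim * dstate;
            state[0] = 1.0f;  // the kernel reads/writes this slot for the sequence
            std::printf("seq %d -> slot %d\n", batch_id, cache_index);
        }
    }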
@@ -152,15 +158,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, params.seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize);
             if constexpr (!kDirectIO) { __syncthreads(); }
-            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, params.seqlen - chunk * kChunkSize);
-            if constexpr (kUseIndex) {
-                load_index<Ktraits>(index + r * params.delta_d_stride, index_vals_load[r], smem_load_index, params.seqlen - chunk * kChunkSize);
-            }
-        }
-        if constexpr (kUseIndex) {
-            index += kChunkSize;
+            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize);
         }
         u += kChunkSize;
         delta += kChunkSize;
@@ -195,9 +195,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
         // If both B and C vary, this is unused.
         weight_t BC_val[kNRows];
         weight_t B_vals[kNItems], C_vals[kNItems];
         if constexpr (kIsVariableB) {
             load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
-                smem_load_weight, (params.seqlen - chunk * kChunkSize) * (1));
+                smem_load_weight, (seqlen - chunk * kChunkSize) * (1));
             if constexpr (!kIsVariableC) {
                 #pragma unroll
                 for (int r = 0; r < kNRows; ++r) {
@@ -208,7 +208,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
         if constexpr (kIsVariableC) {
             auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
             load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (1 ));
+                smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 ));
             if constexpr (!kIsVariableB) {
                 #pragma unroll
                 for (int r = 0; r < kNRows; ++r) {
@@ -232,24 +232,16 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                              !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);

-                // Reset A bar for cumulative sequences (Real)
-                if constexpr (kUseIndex) {
-                    if (index_vals_load[r][i] == 0) {
-                        thread_data[i].x = 0.f;
-                    }
-                }
-
-                if constexpr (!Ktraits::kIsEvenLen) {  // So that the last state is correct
-                    if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
+                if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
+                    if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                         thread_data[i] = make_float2(1.f, 0.f);
                     }
                 }
             }
             // Initialize running total
-            scan_t running_prefix;
-            // If we use WARP_SCAN then all lane 0 of all warps (not just thread 0) needs to read
-            running_prefix = chunk == 0 ? x[(r * params.n_chunks) * params.dstate + state_idx] : ( threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.f, 0.f));
-            // running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float2(1.f, 0.f);
+            scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0);

             SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
             typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
                 thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
@@ -258,7 +250,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
             if (threadIdx.x == 0) {
                 smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                x[(r * params.n_chunks + chunk) * params.dstate + state_idx] = prefix_op.running_prefix;
+                if (chunk == n_chunks - 1) {
+                    ssm_states[state_idx] = input_t(prefix_op.running_prefix.y);
+                }
             }
             #pragma unroll
             for (int i = 0; i < kNItems; ++i) {
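Each chunk's block-wide InclusiveScan is seeded with the running prefix carried over from the previous chunk, and only after the last chunk is the carried value written back into ssm_states. A serial C++ sketch of that carry, using a combine rule of the same shape as the kernel's scan operator (this is a simplification for illustration, not the kernel code):

    #include <cstdio>
    #include <utility>
    #include <vector>

    // The scan element is a pair (a, b); combining a prefix with the next element
    // mirrors the SSM recurrence x_t = a_t * x_{t-1} + b_t:
    // (A, B) then (a, b)  ->  (A * a, B * a + b).
    using scan_t = std::pair<float, float>;

    scan_t combine(scan_t prev, scan_t cur) {
        return {prev.first * cur.first, prev.second * cur.first + cur.second};
    }

    int main() {
        // Two "chunks" of per-token (a, b) values; the running prefix is carried across chunks.
        std::vector<std::vector<scan_t>> chunks = {
            {{0.9f, 1.0f}, {0.8f, 0.5f}},
            {{0.7f, 0.2f}, {0.6f, 0.1f}},
        };
        scan_t running_prefix = {1.0f, 0.0f};  // or (1, initial_state) when has_initial_state is set
        for (auto& chunk : chunks) {
            for (auto& elem : chunk) {
                running_prefix = combine(running_prefix, elem);
                std::printf("state = %f\n", running_prefix.second);
            }
        }
        // After the final chunk, running_prefix.second is what gets stored into ssm_states.
    }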
@@ -270,7 +264,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             }
         }

-        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+        input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
             + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
         __syncthreads();
         #pragma unroll
@@ -278,26 +272,26 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+            store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
         }

         if constexpr (kHasZ) {
-            input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + batch_id * params.z_batch_stride
+            input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + sequence_start_index * params.z_batch_stride
                 + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
-            input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + batch_id * params.out_z_batch_stride
+            input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride
                 + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
             #pragma unroll
             for (int r = 0; r < kNRows; ++r) {
                 input_t z_vals[kNItems];
                 __syncthreads();
-                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, params.seqlen - chunk * kChunkSize);
+                load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize);
                 #pragma unroll
                 for (int i = 0; i < kNItems; ++i) {
                     float z_val = z_vals[i];
                     out_vals[r][i] *= z_val / (1 + expf(-z_val));
                 }
                 __syncthreads();
-                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+                store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
             }
         }

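The z branch gates every output element with silu(z) = z * sigmoid(z) = z / (1 + exp(-z)) right before the final store, rather than applying the activation in a separate pass. A minimal standalone version of that gating:

    #include <cmath>
    #include <cstdio>

    // silu(z) = z * sigmoid(z) = z / (1 + exp(-z)), as used for the gated output.
    float silu(float z) { return z / (1.0f + std::exp(-z)); }

    int main() {
        float out_val = 2.0f;   // scan output for one element
        float z_val = -0.5f;    // corresponding gate value
        out_val *= silu(z_val); // matches: out_vals[r][i] *= z_val / (1 + expf(-z_val));
        std::printf("gated output = %f\n", out_val);
    }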
@@ -316,8 +310,8 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     constexpr bool kIsVariableC = true;
     constexpr bool kHasZ = true;
     BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
-        BOOL_SWITCH(params.index_ptr != nullptr , kUseIndex, [&] {
-            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kUseIndex, input_t, weight_t>;
+        BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
+            using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t>;
             constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
             dim3 grid(params.batch, params.dim / kNRows);
             auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@@ -405,12 +399,15 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         const torch::Tensor out,
                         const torch::Tensor z,
                         const torch::Tensor out_z,
-                        void* D_ptr,
-                        void* delta_bias_ptr,
-                        void* x_ptr,
+                        const c10::optional<at::Tensor>& D,
+                        const c10::optional<at::Tensor>& delta_bias,
+                        const torch::Tensor ssm_states,
                         bool has_z,
                         bool delta_softplus,
-                        void* index_ptr) {
+                        const c10::optional<at::Tensor>& query_start_loc,
+                        const c10::optional<at::Tensor>& cache_indices,
+                        const c10::optional<at::Tensor>& has_initial_state,
+                        bool varlen) {

     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -434,55 +431,83 @@ void set_ssm_params_fwd(SSMParamsBase &params,
     params.A_ptr = A.data_ptr();
     params.B_ptr = B.data_ptr();
     params.C_ptr = C.data_ptr();
-    params.D_ptr = D_ptr;
-    params.delta_bias_ptr = delta_bias_ptr;
+    params.D_ptr = D.has_value() ? D.value().data_ptr() : nullptr;
+    params.delta_bias_ptr = delta_bias.has_value() ? delta_bias.value().data_ptr() : nullptr;
     params.out_ptr = out.data_ptr();
-    params.x_ptr = x_ptr;
+    params.ssm_states_ptr = ssm_states.data_ptr();
     params.z_ptr = has_z ? z.data_ptr() : nullptr;
     params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr;
+    params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
+    params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
+    params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;

-    params.index_ptr = index_ptr;

     // All stride are in elements, not bytes.
     params.A_d_stride = A.stride(0);
     params.A_dstate_stride = A.stride(1);
-    if (!is_variable_B) {
-        params.B_d_stride = B.stride(0);
-    } else {
-        params.B_batch_stride = B.stride(0);
-        params.B_group_stride = B.stride(1);
+    if (varlen){
+        params.B_batch_stride = B.stride(2);
+        params.B_group_stride = B.stride(0);
+        params.B_dstate_stride = B.stride(1);
+        params.C_batch_stride = C.stride(2);
+        params.C_group_stride = C.stride(0);
+        params.C_dstate_stride = C.stride(1);
+
+        params.u_batch_stride = u.stride(1);
+        params.u_d_stride = u.stride(0);
+        params.delta_batch_stride = delta.stride(1);
+        params.delta_d_stride = delta.stride(0);
+        if (has_z) {
+            params.z_batch_stride = z.stride(1);
+            params.z_d_stride = z.stride(0);
+            params.out_z_batch_stride = out_z.stride(1);
+            params.out_z_d_stride = out_z.stride(0);
+        }
+        params.out_batch_stride = out.stride(1);
+        params.out_d_stride = out.stride(0);
+
     }
-    params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2);
-    if (!is_variable_C) {
-        params.C_d_stride = C.stride(0);
+    else{
+        if (!is_variable_B) {
+            params.B_d_stride = B.stride(0);
     } else {
-        params.C_batch_stride = C.stride(0);
-        params.C_group_stride = C.stride(1);
+            params.B_batch_stride = B.stride(0);
+            params.B_group_stride = B.stride(1);
+        }
+        params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2);
+        if (!is_variable_C) {
+            params.C_d_stride = C.stride(0);
+        } else {
+            params.C_batch_stride = C.stride(0);
+            params.C_group_stride = C.stride(1);
+        }
+        params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
+        params.u_batch_stride = u.stride(0);
+        params.u_d_stride = u.stride(1);
+        params.delta_batch_stride = delta.stride(0);
+        params.delta_d_stride = delta.stride(1);
+        if (has_z) {
+            params.z_batch_stride = z.stride(0);
+            params.z_d_stride = z.stride(1);
+            params.out_z_batch_stride = out_z.stride(0);
+            params.out_z_d_stride = out_z.stride(1);
+        }
+        params.out_batch_stride = out.stride(0);
+        params.out_d_stride = out.stride(1);
     }
-    params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
-    params.u_batch_stride = u.stride(0);
-    params.u_d_stride = u.stride(1);
-    params.delta_batch_stride = delta.stride(0);
-    params.delta_d_stride = delta.stride(1);
-    if (has_z) {
-        params.z_batch_stride = z.stride(0);
-        params.z_d_stride = z.stride(1);
-        params.out_z_batch_stride = out_z.stride(0);
-        params.out_z_d_stride = out_z.stride(1);
-    }
-    params.out_batch_stride = out.stride(0);
-    params.out_d_stride = out.stride(1);
 }

-std::vector<torch::Tensor>
-selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
+void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                   const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
                   const c10::optional<torch::Tensor> &D_,
                   const c10::optional<torch::Tensor> &z_,
                   const c10::optional<torch::Tensor> &delta_bias_,
                   bool delta_softplus,
-                  const c10::optional<torch::Tensor> &index_,
-                  const c10::optional<torch::Tensor> &x) {
+                  const c10::optional<torch::Tensor> &query_start_loc,
+                  const c10::optional<torch::Tensor> &cache_indices,
+                  const c10::optional<torch::Tensor> &has_initial_state,
+                  const torch::Tensor &ssm_states) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -505,23 +530,37 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1);

     const auto sizes = u.sizes();
-    const int batch_size = sizes[0];
-    const int dim = sizes[1];
-    const int seqlen = sizes[2];
+    const bool varlen = query_start_loc.has_value();
+    const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
+    const int dim = varlen ? sizes[0] : sizes[1];
+    const int seqlen = varlen ? sizes[1] : sizes[2];
     const int dstate = A.size(1);
-    const int n_groups = is_variable_B ? B.size(1) : 1;
+    const int n_groups = varlen ? B.size(0) : B.size(1);

     TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256");

-    CHECK_SHAPE(u, batch_size, dim, seqlen);
-    CHECK_SHAPE(delta, batch_size, dim, seqlen);
+    if (varlen) {
+        CHECK_SHAPE(u, dim, seqlen);
+        CHECK_SHAPE(delta, dim, seqlen);
+    } else {
+        CHECK_SHAPE(u, batch_size, dim, seqlen);
+        CHECK_SHAPE(delta, batch_size, dim, seqlen);
+    }
     CHECK_SHAPE(A, dim, dstate);
     TORCH_CHECK(is_variable_B, "is_variable_B = False is disabled in favor of reduced binary size")
-    CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen );
+    if (varlen) {
+        CHECK_SHAPE(B, n_groups, dstate, seqlen);
+    } else {
+        CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen);
+    }
     TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1);

     TORCH_CHECK(is_variable_C, "is_variable_C = False is disabled in favor of reduced binary size")
-    CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen);
+    if (varlen) {
+        CHECK_SHAPE(C, n_groups, dstate, seqlen);
+    } else {
+        CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen);
+    }
     TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1);

     if (D_.has_value()) {
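In the varlen layout the leading batch dimension disappears from u, delta and z (tokens of all sequences are packed along one axis) and batch_size is recovered from query_start_loc, while B and C drop their batch dimension as well; ssm_states keeps one state slot per sequence in both modes. A short sketch of the two layouts the checks above enforce, with illustrative numbers:

    #include <cassert>
    #include <vector>

    int main() {
        // Batched layout:  u, delta, z : (batch, dim, seqlen)   B, C : (batch, n_groups, dstate, seqlen)
        // Varlen layout:   u, delta, z : (dim, total_tokens)    B, C : (n_groups, dstate, total_tokens)
        std::vector<int> query_start_loc = {0, 4, 5, 8};  // 3 sequences packed into 8 tokens
        int batch_size = static_cast<int>(query_start_loc.size()) - 1;
        int total_tokens = query_start_loc.back();

        assert(batch_size == 3);
        assert(total_tokens == 8);
        // ssm_states stays (batch, dim, dstate) in both modes.
        return 0;
    }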
@@ -539,13 +578,31 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
         TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1);
         CHECK_SHAPE(delta_bias, dim);
     }
-    if (index_.has_value()) {
-        auto index = index_.value();
-        TORCH_CHECK(index.scalar_type() == at::ScalarType::Int);
-        TORCH_CHECK(index.is_cuda());
-        CHECK_SHAPE(index, batch_size, seqlen);
+    if (has_initial_state.has_value()) {
+        auto has_initial_state_ = has_initial_state.value();
+        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
+        TORCH_CHECK(has_initial_state_.is_cuda());
+        CHECK_SHAPE(has_initial_state_, batch_size);
     }

+    if (query_start_loc.has_value()) {
+        auto query_start_loc_ = query_start_loc.value();
+        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(query_start_loc_.is_cuda());
+    }
+
+    if (cache_indices.has_value()) {
+        auto cache_indices_ = cache_indices.value();
+        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(cache_indices_.is_cuda());
+        CHECK_SHAPE(cache_indices_, batch_size);
+    }
+
     at::Tensor z, out_z;
     const bool has_z = z_.has_value();
     TORCH_CHECK(has_z, "has_z = False is disabled in favor of reduced binary size")
@@ -553,31 +610,38 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     TORCH_CHECK(z.scalar_type() == input_type);
     TORCH_CHECK(z.is_cuda());
     TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
-    CHECK_SHAPE(z, batch_size, dim, seqlen);
-    out_z = torch::empty_like(z);
+    if (varlen){
+        CHECK_SHAPE(z, dim, seqlen);
+    } else {
+        CHECK_SHAPE(z, batch_size, dim, seqlen);
+    }
+
+    out_z = z;

     const int n_chunks = (seqlen + 2048 - 1) / 2048;
     // const int n_chunks = (seqlen + 1024 - 1) / 1024;
     // at::Tensor out = torch::empty_like(u);
     // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
-    at::Tensor out = torch::empty_like(delta);
-    if (x.has_value()){
-        auto _x = x.value();
-        TORCH_CHECK(_x.scalar_type() == weight_type);
-        TORCH_CHECK(_x.is_cuda());
-        TORCH_CHECK(_x.stride(-1) == 1);
-        CHECK_SHAPE(_x, batch_size, dim, n_chunks, dstate * 2);
-    }
+    at::Tensor out = delta;
+    TORCH_CHECK(ssm_states.scalar_type() == input_type);
+    TORCH_CHECK(ssm_states.is_cuda());
+    TORCH_CHECK(ssm_states.stride(-1) == 1);
+    CHECK_SHAPE(ssm_states, batch_size, dim, dstate);

     SSMParamsBase params;
     set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
                        u, delta, A, B, C, out, z, out_z,
-                       D_.has_value() ? D_.value().data_ptr() : nullptr,
-                       delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
-                       x.value().data_ptr(),
+                       D_,
+                       delta_bias_,
+                       ssm_states,
                        has_z,
                        delta_softplus,
-                       index_.has_value() ? index_.value().data_ptr() : nullptr);
+                       query_start_loc,
+                       cache_indices,
+                       has_initial_state,
+                       varlen
+                       );

     // Otherwise the kernel will be launched from cuda:0 device
     // Cast to char to avoid compiler warning about narrowing
@@ -586,8 +650,5 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
         selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
     });
-    std::vector<at::Tensor> result = {out};
-    if (has_z) { result.push_back(out_z); }
-    return result;
 }

@@ -38,6 +38,7 @@ using FragA = Vec<half2, 4>;
 using FragB = Vec<half2, 2>;
 using FragC = Vec<float, 4>;
 using FragS = Vec<half2, 1>;  // quantization scales
+using FragZP = Vec<half2, 4>;

 // Predicated asynchronous global->shared copy; used for inputs A where we apply
 // predication to handle batchsizes that are not multiples of 16.
@@ -175,6 +176,46 @@ __device__ inline FragB dequant<vllm::kU8B128.id()>(int q) {
   return frag_b;
 }

+template <>
+__device__ inline FragB dequant<vllm::kU4.id()>(int q) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+
+  const int SUB = 0x64006400;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd400d400;
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+  return frag_b;
+}
+
+template <>
+__device__ inline FragB dequant<vllm::kU8.id()>(int q) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
+
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
 // Multiply dequantized values by the corresponding quantization scale; used
 // only for grouped quantization.
 __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
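The new dequant specializations lean on the usual fp16 bias trick: 0x6400 is the half bit pattern for 1024.0, so OR-ing a small unsigned value v into its mantissa yields 1024 + v, and subtracting the 0x6400 magic recovers v, two lanes at a time per half2. A standalone demonstration of that identity (this decodes the bit pattern in plain C++ for illustration and is not part of the kernel):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Decode an IEEE fp16 bit pattern to float (sufficient for normal numbers).
    float half_bits_to_float(uint16_t h) {
        int exp = (h >> 10) & 0x1F;
        int mant = h & 0x3FF;
        float sign = (h & 0x8000) ? -1.0f : 1.0f;
        return sign * std::ldexp(1.0f + mant / 1024.0f, exp - 15);
    }

    int main() {
        // 0x6400 is fp16 1024.0; OR-ing a small value v into the mantissa yields 1024 + v,
        // which is why the kernel subtracts the 0x6400 magic to recover v.
        for (unsigned v = 0; v < 16; ++v) {
            uint16_t bits = static_cast<uint16_t>(0x6400 | v);
            std::printf("v=%2u -> %g, minus 1024 = %g\n", v, half_bits_to_float(bits),
                        half_bits_to_float(bits) - 1024.0f);
        }
    }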
@@ -183,11 +224,10 @@ __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
   frag_b[1] = __hmul2(frag_b[1], s);
 }

-// Given 2 floats multiply by 2 scales (halves)
-__device__ inline void scale_float(float* c, FragS& s) {
-  __half* s_ptr = reinterpret_cast<__half*>(&s);
-  c[0] = __fmul_rn(c[0], __half2float(s_ptr[0]));
-  c[1] = __fmul_rn(c[1], __half2float(s_ptr[1]));
+__device__ inline void sub_zp(FragB& frag_b, half2& frag_zp, int i) {
+  half2 zp = __half2half2(reinterpret_cast<__half*>(&frag_zp)[i]);
+  frag_b[0] = __hsub2(frag_b[0], zp);
+  frag_b[1] = __hsub2(frag_b[1], zp);
 }

 // Same as above, but for act_order (each K is multiplied individually)
@@ -205,6 +245,13 @@ __device__ inline void scale4(FragB& frag_b, FragS& frag_s_1, FragS& frag_s_2,
   frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
 }

+// Given 2 floats multiply by 2 scales (halves)
+__device__ inline void scale_float(float* c, FragS& s) {
+  __half* s_ptr = reinterpret_cast<__half*>(&s);
+  c[0] = __fmul_rn(c[0], __half2float(s_ptr[0]));
+  c[1] = __fmul_rn(c[1], __half2float(s_ptr[1]));
+}
+
 // Wait until barrier reaches `count`, then lock for current threadblock.
 __device__ inline void barrier_acquire(int* lock, int count) {
   if (threadIdx.x == 0) {
@@ -248,10 +295,11 @@ template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
           const int stages,  // number of stages for the async global->shared
                              // fetch pipeline
           const bool has_act_order,    // whether act_order is enabled
+          const bool has_zp,           // whether zero-points are enabled
           const int group_blocks = -1  // number of consecutive 16x16 blocks
                                        // with a separate quantization scale
           >
-__device__ inline void MarlinMoESingle(
+__device__ void MarlinMoESingle(
     const int4* __restrict__ A,  // fp16 input matrix of shape mxk
     const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
     int4* __restrict__ C,  // fp16 output buffer of shape mxn
@@ -259,6 +307,8 @@ __device__ inline void MarlinMoESingle(
     const float* __restrict__ topk_weights,  // float topk weights
     const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                           // (k/groupsize)xn
+    const int4* __restrict__ zp_ptr,      // 4bit packed zero-points of shape
+                                          // (k/groupsize)x(n/pack_factor)
     const int* __restrict__ g_idx,        // int32 group indices of shape k
     const int* __restrict__ expert_offsets,
     int num_groups,  // number of scale groups per output channel
@@ -400,8 +450,12 @@ __device__ inline void MarlinMoESingle(
   int tb_n_warps = thread_n_blocks / 4;
   int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;

-  constexpr int sorted_sh_stride = threads;
-  constexpr int sorted_gl_stride = threads;
+  // Zero-points sizes/strides
+  int zp_gl_stride = (prob_n / pack_factor) / 4;
+  constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4;
+  constexpr int zp_tb_groups = s_tb_groups;
+  constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0;
+  int zp_gl_rd_delta = zp_gl_stride;

   // Global A read index of current thread.
   int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
@@ -442,6 +496,19 @@ __device__ inline void MarlinMoESingle(
   int s_sh_wr = threadIdx.x;
   bool s_sh_wr_pred = threadIdx.x < s_sh_stride;

+  // Zero-points
+  int zp_gl_rd;
+  if constexpr (has_zp) {
+    if constexpr (group_blocks == -1) {
+      zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
+    } else {
+      zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
+                 zp_sh_stride * slice_col + threadIdx.x;
+    }
+  }
+  int zp_sh_wr = threadIdx.x;
+  bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
+
   // We use a different scale layout for grouped and column-wise quantization as
   // we scale a `half2` tile in column-major layout in the former and in
   // row-major in the latter case.
@@ -453,23 +520,29 @@ __device__ inline void MarlinMoESingle(
     s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
               (threadIdx.x % 32) % 4;

+  // Zero-points have the same read layout as the scales
+  // (without column-wise case)
+  constexpr int num_col_threads = 8;
+  constexpr int num_row_threads = 4;
+  constexpr int num_ints_per_thread = 8 / pack_factor;
+  int zp_sh_rd;
+  if constexpr (has_zp) {
+    zp_sh_rd = num_ints_per_thread * num_col_threads *
+                   ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+               num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
+  }
+
   int sh_first_group_id = -1;
   int sh_num_groups = -1;
   constexpr int sh_max_num_groups = 32;

-  int shs_size;
-  if constexpr (has_act_order)
-    shs_size = sh_max_num_groups * s_sh_stride + threads;
-  else
-    shs_size = group_blocks > 0 ? stages * s_sh_stage : threads;
-
   extern __shared__ int4 sh[];
   // Shared memory storage for global fetch pipelines.
   int4* sh_a = sh;
   int4* sh_b = sh_a + (stages * a_sh_stage);
   int4* sh_g_idx = sh_b + (stages * b_sh_stage);
-  int4* sh_s = sh_g_idx + (stages * g_idx_stage);
-  int* sh_sorted = (int*)(sh_s + shs_size);
+  int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
+  int4* sh_s = sh_zp + (stages * zp_sh_stage);

   // Precompute which thread should not read memory in which iterations; this is
   // needed if there are more threads than required for a certain tilesize or
@@ -525,8 +598,10 @@ __device__ inline void MarlinMoESingle(
|
|||||||
FragA frag_a[2][thread_m_blocks];
|
FragA frag_a[2][thread_m_blocks];
|
||||||
I4 frag_b_quant[2][b_thread_vecs];
|
I4 frag_b_quant[2][b_thread_vecs];
|
||||||
FragC frag_c[thread_m_blocks][4][2];
|
FragC frag_c[thread_m_blocks][4][2];
|
||||||
FragS frag_s[2][4]; // No act-order
|
FragS frag_s[2][4]; // No act-order
|
||||||
FragS act_frag_s[2][4][4]; // For act-order
|
FragS act_frag_s[2][4][4]; // For act-order
|
||||||
|
int frag_qzp[2][num_ints_per_thread]; // Zero-points
|
||||||
|
FragZP frag_zp; // Zero-points in fp16
|
||||||
|
|
||||||
// Zero accumulators.
|
// Zero accumulators.
|
||||||
auto zero_accums = [&]() {
|
auto zero_accums = [&]() {
|
||||||
@@ -633,6 +708,28 @@ __device__ inline void MarlinMoESingle(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if constexpr (has_zp && group_blocks != -1) {
|
||||||
|
int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
|
||||||
|
|
||||||
|
if constexpr (group_blocks >= thread_k_blocks) {
|
||||||
|
// Only fetch zero-points if this tile starts a new group
|
||||||
|
if (pipe % (group_blocks / thread_k_blocks) == 0) {
|
||||||
|
if (zp_sh_wr_pred) {
|
||||||
|
cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
|
||||||
|
}
|
||||||
|
zp_gl_rd += zp_gl_rd_delta;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < zp_tb_groups; i++) {
|
||||||
|
if (zp_sh_wr_pred) {
|
||||||
|
cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr],
|
||||||
|
&zp_ptr[zp_gl_rd]);
|
||||||
|
}
|
||||||
|
zp_gl_rd += zp_gl_rd_delta;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Insert a fence even when we are winding down the pipeline to ensure that
|
// Insert a fence even when we are winding down the pipeline to ensure that
|
||||||
@@ -640,15 +737,9 @@ __device__ inline void MarlinMoESingle(
|
|||||||
cp_async_fence();
|
cp_async_fence();
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO we are currently hitting illegal memory accesses when fetching
|
auto fetch_zp_to_shared = [&]() {
|
||||||
// sorted_ids to shared data: fix this
|
if (zp_sh_wr_pred) {
|
||||||
auto fetch_sorted_ids_to_shared = [&]() {
|
cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]);
|
||||||
const int mpt = ceildiv(prob_m, threads);
|
|
||||||
for (int i = 0; i < mpt; i++) {
|
|
||||||
if ((i * sorted_gl_stride) + threadIdx.x < prob_m) {
|
|
||||||
sh_sorted[(i * sorted_sh_stride) + threadIdx.x] =
|
|
||||||
sorted_ids[(i * sorted_gl_stride) + threadIdx.x];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -799,8 +890,83 @@ __device__ inline void MarlinMoESingle(
     }
   };
 
+  auto fetch_zp_to_registers = [&](int k, int full_pipe) {
+    // This code does not handle group_blocks == 0,
+    // which signifies act_order.
+    // has_zp implies AWQ, which doesn't have act_order,
+    static_assert(!has_zp || group_blocks != 0);
+
+    if constexpr (has_zp) {
+      int pipe = full_pipe % stages;
+
+      if constexpr (group_blocks == -1) {
+        for (int i = 0; i < num_ints_per_thread; i++) {
+          frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp))[zp_sh_rd + i];
+        }
+
+      } else if constexpr (group_blocks >= thread_k_blocks) {
+        int4* sh_zp_stage =
+            sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) *
+                                   (pipe / (group_blocks / thread_k_blocks)));
+        for (int i = 0; i < num_ints_per_thread; i++) {
+          frag_qzp[k % 2][i] =
+              (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
+        }
+      } else {
+        int warp_id = threadIdx.x / 32;
+        int n_warps = thread_n_blocks / 4;
+
+        int warp_row = warp_id / n_warps;
+
+        int cur_k = warp_row * 16;
+        cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+        int k_blocks = cur_k / 16;
+        int cur_group_id = 0;
+
+        // Suppress bogus and persistent divide-by-zero warning
+  #pragma nv_diagnostic push
+  #pragma nv_diag_suppress divide_by_zero
+        cur_group_id = k_blocks / group_blocks;
+  #pragma nv_diagnostic pop
+
+        int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+        sh_zp_stage += cur_group_id * zp_sh_stride;
+
+        for (int i = 0; i < num_ints_per_thread; i++) {
+          frag_qzp[k % 2][i] =
+              (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
+        }
+      }
+    }
+  };
+
   // Execute the actual tensor core matmul of a sub-tile.
   auto matmul = [&](int k) {
+    if constexpr (has_zp) {
+      FragB frag_zp_0;
+      FragB frag_zp_1;
+      int zp_quant_0, zp_quant_1;
+
+      if constexpr (w_type.size_bits() == 4) {
+        zp_quant_0 = frag_qzp[k % 2][0];
+        zp_quant_1 = zp_quant_0 >> 8;
+      } else {
+        static_assert(w_type.size_bits() == 8);
+        zp_quant_0 = frag_qzp[k % 2][0];
+        zp_quant_1 = frag_qzp[k % 2][1];
+      }
+
+      frag_zp_0 = dequant<w_type_id>(zp_quant_0);
+      frag_zp_1 = dequant<w_type_id>(zp_quant_1);
+
+      frag_zp[0] = frag_zp_0[0];
+      frag_zp[1] = frag_zp_0[1];
+      frag_zp[2] = frag_zp_1[0];
+      frag_zp[3] = frag_zp_1[1];
+    }
+
   // We have the m dimension as the inner loop in order to encourage overlapping
   // dequantization and matmul operations.
 #pragma unroll
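For 4-bit weights the zero-point fetch keeps whole 32-bit words in frag_qzp and lets dequant<w_type_id> expand them; zp_quant_1 = zp_quant_0 >> 8 shifts the remaining packed nibbles into the positions the dequantizer expects. Numerically, every output element is just (q - z) * s. A plain-arithmetic sketch of that math follows; it is not the kernel's vectorized lop3/half2 fast path, and the helper name is made up for illustration.

// Plain-arithmetic reference for AWQ-style dequantization with zero-points.
// Eight unsigned 4-bit values are packed into each 32-bit word.
__host__ __device__ inline float dequant_u4_with_zp(unsigned packed_q,
                                                    unsigned packed_zp,
                                                    int lane, float scale) {
  unsigned q = (packed_q >> (4 * lane)) & 0xF;   // quantized weight nibble
  unsigned z = (packed_zp >> (4 * lane)) & 0xF;  // matching zero-point nibble
  return (static_cast<int>(q) - static_cast<int>(z)) * scale;
}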
@@ -818,6 +984,10 @@ __device__ inline void MarlinMoESingle(
 
       FragB frag_b0 = dequant<w_type_id>(b_quant_0);
       FragB frag_b1 = dequant<w_type_id>(b_quant_1);
+      // Apply zero-point to frag_b0
+      if constexpr (has_zp) {
+        sub_zp(frag_b0, frag_zp[j], 0);
+      }
 
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
@@ -829,6 +999,11 @@ __device__ inline void MarlinMoESingle(
         }
       }
 
+      // Apply zero-point to frag_b1
+      if constexpr (has_zp) {
+        sub_zp(frag_b1, frag_zp[j], 1);
+      }
+
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
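The new sub_zp calls run on the dequantized fragments before scale4/scale are applied, so the effective weight is (w_q - z) * s rather than w_q * s - z. Below is a hedged sketch of what a sub_zp-style helper does for fp16 fragments, assuming half2 storage; the real helper is templated over the scalar type.

#include <cuda_fp16.h>

// Hedged sketch: broadcast one dequantized zero-point across a half2 lane
// pair and subtract it from both halves of a weight fragment, before scaling.
__device__ inline void sub_zero_point(half2 (&frag_b)[2], half zp) {
  half2 zp2 = __half2half2(zp);        // replicate z into both fp16 lanes
  frag_b[0] = __hsub2(frag_b[0], zp2);
  frag_b[1] = __hsub2(frag_b[1], zp2);
}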
@@ -1062,9 +1237,6 @@ __device__ inline void MarlinMoESingle(
 
   // Start global fetch and register load pipelines.
   auto start_pipes = [&]() {
-    // TODO re-enable after fixing this function
-    // fetch_sorted_ids_to_shared();
-    // __syncthreads();
 
 #pragma unroll
     for (int i = 0; i < stages - 1; i++) {
@@ -1075,6 +1247,12 @@ __device__ inline void MarlinMoESingle(
         }
         fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
       }
 
+      if constexpr (has_zp && group_blocks == -1) {
+        if (i == 0) {
+          fetch_zp_to_shared();
+        }
+      }
       fetch_to_shared(i, i, i < slice_iters);
     }
 
@@ -1083,6 +1261,7 @@ __device__ inline void MarlinMoESingle(
     init_same_group(0);
     fetch_to_registers(0, 0);
     fetch_scales_to_registers(0, 0);
+    fetch_zp_to_registers(0, 0);
    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
     slice_k_start_shared_fetch += tb_k * (stages - 1);
   };
@@ -1102,6 +1281,7 @@ __device__ inline void MarlinMoESingle(
       for (int k = 0; k < b_sh_wr_iters; k++) {
        fetch_to_registers(k + 1, pipe % stages);
        fetch_scales_to_registers(k + 1, pipe);
+       fetch_zp_to_registers(k + 1, pipe);
        if (k == b_sh_wr_iters - 2) {
          fetch_to_shared((pipe + stages - 1) % stages, pipe,
                          slice_iters >= stages);
@@ -1236,7 +1416,9 @@ __device__ inline void MarlinMoESingle(
 
       } else {
         s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+        zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
       }
 
       start_pipes();
     }
   }
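These hunks splice the zero-point loads into the existing software pipeline: the prologue prefetches stages - 1 tiles (and, for channelwise group_blocks == -1, the zero-points once), and the steady state loads registers for iteration k + 1 while a later tile is fetched to shared memory. A generic sketch of that structure is shown below; it only mirrors the loop shape, with the kernel-specific calls left as comments, and is not the kernel's actual code.

// Generic shape of the global->shared->register pipeline (illustrative).
template <int stages>
__device__ void pipeline_shape(int slice_iters, int b_sh_wr_iters) {
  for (int i = 0; i < stages - 1; i++) {
    // fetch_to_shared(i, i, i < slice_iters);   // async copies for tile i
  }
  // wait_for_stage();                            // stage 0 now visible
  // fetch_to_registers(0, 0); fetch_scales_to_registers(0, 0);
  // fetch_zp_to_registers(0, 0);                 // new: zero-points join the pipe
  int pipe = 0;
  while (slice_iters-- > 0) {
    for (int k = 0; k < b_sh_wr_iters; k++) {
      // fetch_to_registers(k + 1, pipe % stages);
      // fetch_scales_to_registers(k + 1, pipe);
      // fetch_zp_to_registers(k + 1, pipe);      // new
      // if (k == b_sh_wr_iters - 2)
      //   fetch_to_shared((pipe + stages - 1) % stages, pipe, ...);
      // matmul(k);
    }
    pipe++;
  }
}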
@@ -1250,6 +1432,7 @@ template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
           const int stages,  // number of stages for the async global->shared
                              // fetch pipeline
           const bool has_act_order,    // whether act_order is enabled
+          const bool has_zp,           // whether zero-points are enabled
           const int group_blocks = -1  // number of consecutive 16x16 blocks
                                        // with a separate quantization scale
           >
@@ -1261,6 +1444,8 @@ __global__ void MarlinMoE(
     const float* __restrict__ topk_weights,  // float topk weights
     const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                           // (k/groupsize)xn
+    const int4* __restrict__ zp_ptr,  // 4bit packed zero-points of shape
+                                      // (k/groupsize)x(n/pack_factor)
     const int* __restrict__ g_idx,  // int32 group indices of shape k
     const int* __restrict__ expert_offsets,
     int num_groups,  // number of scale groups per output channel
@@ -1309,29 +1494,29 @@ __global__ void MarlinMoE(
 
   if (max_block == 1) {
     MarlinMoESingle<w_type_id, threads, 1, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
-        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
+                    stages, has_act_order, has_zp, group_blocks>(
+        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 2) {
     MarlinMoESingle<w_type_id, threads, 2, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
-        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
+                    stages, has_act_order, has_zp, group_blocks>(
+        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 3) {
     MarlinMoESingle<w_type_id, threads, 3, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
-        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
+                    stages, has_act_order, has_zp, group_blocks>(
+        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else {
     MarlinMoESingle<w_type_id, threads, 4, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
-        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
+                    stages, has_act_order, has_zp, group_blocks>(
+        A, B, C, sorted_ids_expert, topk_weights, scales_ptr, zp_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
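max_block (and, after this change, has_zp) is only known at runtime, while MarlinMoESingle takes it as a template argument, hence the if/else ladder above that maps each runtime value onto a separate instantiation. The same pattern in miniature, with invented names, assuming nothing beyond the CUDA runtime:

#include <cuda_runtime.h>

// Miniature runtime-to-template dispatch, mirroring the ladder above.
template <int MAX_BLOCK, bool HAS_ZP>
__global__ void moe_kernel_stub(int* out) {
  if (threadIdx.x == 0) *out = MAX_BLOCK * 10 + (HAS_ZP ? 1 : 0);
}

void launch_stub(int max_block, bool has_zp, int* out, cudaStream_t stream) {
  if (max_block == 1 && !has_zp)
    moe_kernel_stub<1, false><<<1, 32, 0, stream>>>(out);
  else if (max_block == 1 && has_zp)
    moe_kernel_stub<1, true><<<1, 32, 0, stream>>>(out);
  else if (!has_zp)
    moe_kernel_stub<4, false><<<1, 32, 0, stream>>>(out);
  else
    moe_kernel_stub<4, true><<<1, 32, 0, stream>>>(out);
}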
@@ -1347,6 +1532,7 @@ template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
           const int stages,  // number of stages for the async global->shared
                              // fetch pipeline
           const bool has_act_order,    // whether act_order is enabled
+          const bool has_zp,           // whether zero-points are enabled
           const int group_blocks = -1  // number of consecutive 16x16 blocks
                                        // with a separate quantization scale
           >
@@ -1358,6 +1544,8 @@ __global__ void MarlinMoE(
     const float* __restrict__ topk_weights,  // float topk weights
     const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                           // (k/groupsize)xn
+    const int4* __restrict__ zp_ptr,  // 4bit packed zero-points of shape
+                                      // (k/groupsize)x(n/pack_factor)
     const int* __restrict__ g_idx,  // int32 group indices of shape k
     const int* __restrict__ expert_offsets,
     int num_groups,  // number of scale groups per output channel
@@ -1374,7 +1562,6 @@ __global__ void MarlinMoE(
     int current_m_block,  // current m block to start kernel computation from
     int max_par,          // maximum parallelism
     int cfg_max_m_blocks  // upper bound on m blocks
-
 ) {
   // Marlin is not implemented yet for SM < 8.0
   assert(false);
@@ -1389,37 +1576,41 @@ __global__ void MarlinMoE(
   const int USER_THREADS =
       256;               // Note: This is only used with user-provided thread_k/n
   const int STAGES = 4;  // 4 pipeline stages fit into shared memory
-  // const int SHARED_MEM =
-  //     96 * 1024; // max shared memory on compute capability 8.6 (< 8.0)
 
   static constexpr int min_thread_n = 64;
   static constexpr int min_thread_k = 64;
 
 #define __CALL_IF_MOE(W_TYPE, THREAD_N_BLOCKS, THREAD_K_BLOCKS, HAS_ACT_ORDER, \
-                      GROUP_BLOCKS, NUM_THREADS)                               \
+                      HAS_ZP, GROUP_BLOCKS, NUM_THREADS)                       \
   else if (q_type == W_TYPE && thread_n_blocks == THREAD_N_BLOCKS &&           \
            thread_k_blocks == THREAD_K_BLOCKS &&                               \
-           has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&   \
-           num_threads == NUM_THREADS) {                                       \
+           has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP &&               \
+           group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) {       \
     cudaFuncSetAttribute(                                                      \
         MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,  \
-                  STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,                        \
+                  STAGES, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS>,                \
         cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);          \
     MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,      \
-              STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>                             \
+              STAGES, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS>                     \
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                     \
             A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,      \
-            g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,             \
+            zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,     \
             num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks,           \
             replicate_input, apply_weights, m_block, max_par,                  \
             cfg_max_m_blocks);                                                 \
   }
 
 #define GPTQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)              \
-  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)              \
-  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)            \
-  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)             \
-  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)             \
-  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS)       \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS)     \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS)      \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS)      \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)
+
+#define AWQ_CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)               \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS)      \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS)       \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS)       \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)
 
 }  // namespace marlin_moe
31 csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu Normal file
@@ -0,0 +1,31 @@
+#include "marlin_moe_kernel_ku4.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks) {
+  bool has_zp = true;
+
+  if (false) {
+  }
+  AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
+  AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
+  AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
+  AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
+  else {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace marlin_moe
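The new translation unit shows the dispatch idiom used throughout these files: the function opens with an empty if (false) {} so every macro expansion can begin with else if, and it returns bool so callers can chain the per-dtype dispatchers. A self-contained analogue of that idiom is sketched below; the macro, stub, and parameter names are invented for illustration.

#include <cstdio>

// Stub standing in for a templated kernel launch.
template <bool HAS_ZP, int GROUP_BLOCKS>
void kernel_stub(int n) {
  std::printf("has_zp=%d group_blocks=%d n=%d\n", HAS_ZP, GROUP_BLOCKS, n);
}

// Each expansion is an `else if` arm matching runtime values against the
// macro's compile-time constants.
#define CALL_IF_SKETCH(HAS_ZP, GROUP_BLOCKS)                    \
  else if (has_zp == HAS_ZP && group_blocks == GROUP_BLOCKS) {  \
    kernel_stub<HAS_ZP, GROUP_BLOCKS>(n);                       \
  }

bool dispatch_sketch(bool has_zp, int group_blocks, int n) {
  if (false) {  // empty head so every macro arm can start with `else if`
  }
  CALL_IF_SKETCH(false, -1)
  CALL_IF_SKETCH(false, 8)
  CALL_IF_SKETCH(true, -1)
  CALL_IF_SKETCH(true, 8)
  else {
    return false;  // no matching instantiation compiled in
  }
  return true;
}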
20 csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h Normal file
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "marlin_moe_kernel.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks);
+
+}  // namespace marlin_moe
@@ -9,11 +9,13 @@ bool call_marlin_moe_kernel_ku4b8(
|
|||||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
|
||||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
|
||||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
|
||||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
|
||||||
int cfg_max_m_blocks) {
|
int m_block, int max_par, int cfg_max_m_blocks) {
|
||||||
|
bool has_zp = false;
|
||||||
|
|
||||||
if (false) {
|
if (false) {
|
||||||
}
|
}
|
||||||
GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
|
GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ bool call_marlin_moe_kernel_ku4b8(
|
|||||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
|
||||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
|
||||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
|
||||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
|
||||||
int cfg_max_m_blocks);
|
int m_block, int max_par, int cfg_max_m_blocks);
|
||||||
|
|
||||||
} // namespace marlin_moe
|
} // namespace marlin_moe
|
||||||
|
|||||||
@@ -9,11 +9,13 @@ bool call_marlin_moe_kernel_ku8b128(
|
|||||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
|
||||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
|
||||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
|
||||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
|
||||||
int cfg_max_m_blocks) {
|
int m_block, int max_par, int cfg_max_m_blocks) {
|
||||||
|
bool has_zp = false;
|
||||||
|
|
||||||
if (false) {
|
if (false) {
|
||||||
}
|
}
|
||||||
GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
|
GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
|
||||||
|
|||||||
@@ -9,10 +9,10 @@ bool call_marlin_moe_kernel_ku8b128(
|
|||||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
|
||||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
|
||||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
|
||||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
|
||||||
int cfg_max_m_blocks);
|
int m_block, int max_par, int cfg_max_m_blocks);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,9 +25,12 @@
|
|||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "core/exception.hpp"
|
||||||
#include "core/scalar_type.hpp"
|
#include "core/scalar_type.hpp"
|
||||||
|
#include "core/registration.h"
|
||||||
#include "marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
#include "marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||||
#include "marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
#include "marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||||
|
#include "marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline std::string str(T x) {
|
inline std::string str(T x) {
|
||||||
@@ -155,6 +158,7 @@ thread_config_t small_batch_thread_configs[] = {
|
|||||||
{128, 64, 128}, // Reduce N 2X, same K
|
{128, 64, 128}, // Reduce N 2X, same K
|
||||||
{64, 256, 256}, // Reduce K 2X, increase N 2X
|
{64, 256, 256}, // Reduce K 2X, increase N 2X
|
||||||
{64, 128, 128}, // Reduce K 2X, same N
|
{64, 128, 128}, // Reduce K 2X, same N
|
||||||
|
{64, 64, 128}, // Reduce both 2X
|
||||||
};
|
};
|
||||||
|
|
||||||
thread_config_t large_batch_thread_configs[] = {
|
thread_config_t large_batch_thread_configs[] = {
|
||||||
@@ -165,6 +169,7 @@ thread_config_t large_batch_thread_configs[] = {
|
|||||||
{128, 128, 256}, // Reduce N 2X, increase K 2X
|
{128, 128, 256}, // Reduce N 2X, increase K 2X
|
||||||
{64, 128, 128}, // Reduce N 2X, same K
|
{64, 128, 128}, // Reduce N 2X, same K
|
||||||
{128, 64, 128}, // Reduce N 4X, increase K 2X
|
{128, 64, 128}, // Reduce N 4X, increase K 2X
|
||||||
|
{64, 64, 128}, // Reduce N 4X, same K
|
||||||
};
|
};
|
||||||
|
|
||||||
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
||||||
@@ -189,7 +194,7 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
|||||||
int load_groups =
|
int load_groups =
|
||||||
tb_groups * STAGES * 2; // Chunk size is 2x pipeline over dim K
|
tb_groups * STAGES * 2; // Chunk size is 2x pipeline over dim K
|
||||||
load_groups = max(load_groups, 32); // We load at least 32 scale groups
|
load_groups = max(load_groups, 32); // We load at least 32 scale groups
|
||||||
return load_groups * tb_n * 2;
|
return load_groups * tb_n * 4;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
int tb_scales = tb_groups * tb_n * 2;
|
int tb_scales = tb_groups * tb_n * 2;
|
||||||
@@ -310,27 +315,28 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
|
|||||||
return exec_config_t{0, {-1, -1, -1}};
|
return exec_config_t{0, {-1, -1, -1}};
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CALL_MOE_KERNEL_FUNCTION(KERNEL_FUNCTION) \
|
#define CALL_MOE_KERNEL_FUNCTION(KERNEL_FUNCTION) \
|
||||||
else if (KERNEL_FUNCTION(q_type, thread_n_blocks, thread_k_blocks, \
|
else if (KERNEL_FUNCTION( \
|
||||||
has_act_order, group_blocks, num_threads, blocks, \
|
q_type, thread_n_blocks, thread_k_blocks, has_act_order, \
|
||||||
max_shared_mem, stream, A_ptr, B_ptr, C_ptr, \
|
group_blocks, num_threads, blocks, max_shared_mem, stream, \
|
||||||
sorted_ids_ptr, topk_weights_ptr, s_ptr, g_idx_ptr, \
|
A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr, \
|
||||||
expert_offsets_ptr, num_groups, expert_idx, \
|
zp_ptr, g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx, \
|
||||||
num_experts, topk, prob_m, prob_n, prob_k, tot_m, \
|
num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks, \
|
||||||
locks, replicate_input, apply_weights, m_block, \
|
replicate_input, apply_weights, m_block, max_par, \
|
||||||
max_par, exec_cfg.max_m_blocks)) { \
|
exec_cfg.max_m_blocks)) { \
|
||||||
}
|
}
|
||||||
|
|
||||||
void marlin_mm_moe(const void* A, const void* B, void* C,
|
void marlin_mm_moe(const void* A, const void* B, void* C,
|
||||||
const void* sorted_ids, const void* topk_weights,
|
const void* sorted_ids, const void* topk_weights,
|
||||||
const void* topk_ids, const void* s, const void* g_idx,
|
const void* topk_ids, const void* s, void* zp,
|
||||||
const void* perm, void* a_tmp, void* expert_offsets,
|
const void* g_idx, const void* perm, void* a_tmp,
|
||||||
int prob_m, int prob_n, int prob_k, void* workspace,
|
void* expert_offsets, int prob_m, int prob_n, int prob_k,
|
||||||
vllm::ScalarType const& q_type, bool has_act_order,
|
void* workspace, vllm::ScalarType const& q_type,
|
||||||
bool is_k_full, int num_groups, int group_size,
|
bool has_act_order, bool is_k_full, bool has_zp,
|
||||||
int num_experts, int topk, int moe_block_size, int dev,
|
int num_groups, int group_size, int num_experts, int topk,
|
||||||
cudaStream_t stream, int thread_k, int thread_n, int sms,
|
int moe_block_size, int dev, cudaStream_t stream,
|
||||||
int max_par, bool replicate_input, bool apply_weights) {
|
int thread_k, int thread_n, int sms, int max_par,
|
||||||
|
bool replicate_input, bool apply_weights) {
|
||||||
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
|
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
|
||||||
", ", prob_n, ", ", prob_k, "]");
|
", ", prob_n, ", ", prob_k, "]");
|
||||||
|
|
||||||
@@ -433,11 +439,9 @@ void marlin_mm_moe(const void* A, const void* B, void* C,
|
|||||||
int4* C_ptr = (int4*)C;
|
int4* C_ptr = (int4*)C;
|
||||||
const float* topk_weights_ptr = (const float*)topk_weights;
|
const float* topk_weights_ptr = (const float*)topk_weights;
|
||||||
const int* sorted_ids_ptr = (const int*)sorted_ids;
|
const int* sorted_ids_ptr = (const int*)sorted_ids;
|
||||||
const int4* s_ptr =
|
const int4* s_ptr = (const int4*)s + num_groups * prob_n / 8 * expert_idx;
|
||||||
(const int4*)s +
|
const int4* zp_ptr =
|
||||||
(((group_size == -1 || group_size == 0) ? 1 : prob_k / group_size) *
|
(const int4*)zp + num_groups * prob_n / (pack_factor * 4) * expert_idx;
|
||||||
prob_n / 8) *
|
|
||||||
expert_idx;
|
|
||||||
const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx;
|
const int* g_idx_ptr = (const int*)g_idx + prob_k * expert_idx;
|
||||||
const int* perm_ptr = (const int*)perm + prob_k * expert_idx;
|
const int* perm_ptr = (const int*)perm + prob_k * expert_idx;
|
||||||
int* locks = (int*)workspace;
|
int* locks = (int*)workspace;
|
||||||
@@ -458,6 +462,7 @@ void marlin_mm_moe(const void* A, const void* B, void* C,
|
|||||||
}
|
}
|
||||||
CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4b8)
|
CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4b8)
|
||||||
CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku8b128)
|
CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku8b128)
|
||||||
|
CALL_MOE_KERNEL_FUNCTION(call_marlin_moe_kernel_ku4)
|
||||||
else {
|
else {
|
||||||
TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
|
TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
|
||||||
str(prob_n) + ", " + str(prob_k) + "]" +
|
str(prob_n) + ", " + str(prob_k) + "]" +
|
||||||
@@ -477,13 +482,21 @@ torch::Tensor marlin_gemm_moe(
     const torch::Tensor& a, const torch::Tensor& b_q_weights,
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
-    const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
-    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
-    int64_t num_experts, int64_t topk, int64_t moe_block_size,
-    bool replicate_input, bool apply_weights) {
-  TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
-              "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
+    torch::Tensor& b_zeros, const torch::Tensor& g_idx,
+    const torch::Tensor& perm, torch::Tensor& workspace,
+    vllm::ScalarTypeTorchPtr const& b_q_type, int64_t size_m, int64_t size_n,
+    int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
+    int64_t moe_block_size, bool replicate_input, bool apply_weights) {
+  bool has_zp = b_zeros.size(1) != 0;
+  if (has_zp) {
+    TORCH_CHECK(
+        *b_q_type == vllm::kU4,
+        "b_q_type must be u4 when has_zp = True. Got = ", b_q_type->str());
+  } else {
+    TORCH_CHECK(
+        *b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
+        "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
+  }
 
   int pack_factor = 32 / b_q_type->size_bits();
 
@@ -521,6 +534,9 @@ torch::Tensor marlin_gemm_moe(
|
|||||||
" is not size_n = ", size_n);
|
" is not size_n = ", size_n);
|
||||||
num_groups = b_scales.size(1);
|
num_groups = b_scales.size(1);
|
||||||
|
|
||||||
|
TORCH_CHECK(VLLM_IMPLIES(!is_k_full, has_act_order),
|
||||||
|
"if is_k_full is false, has_act_order must be true");
|
||||||
|
|
||||||
if (has_act_order) {
|
if (has_act_order) {
|
||||||
if (is_k_full) {
|
if (is_k_full) {
|
||||||
TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
|
TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
|
||||||
@@ -542,13 +558,30 @@ torch::Tensor marlin_gemm_moe(
     }
   }
 
+  // Verify b_zeros
+  if (has_zp) {
+    int rank = b_zeros.sizes().size();
+    TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3");
+    TORCH_CHECK(b_zeros.size(1) == num_groups,
+                "b_zeros dim 1 = ", b_zeros.size(1),
+                " is not num_groups = ", num_groups);
+    TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor,
+                "b_zeros dim 2 = ", b_zeros.size(2),
+                " is not size_n / pack_factor = ", size_n / pack_factor);
+  }
+
   marlin_moe::marlin_mm_moe(
       a.data_ptr(), b_q_weights.data_ptr(), c.data_ptr(), sorted_ids.data_ptr(),
       topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
-      g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
+      b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
       expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
-      *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts,
-      topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
-      thread_n, sms, max_par, replicate_input, apply_weights);
+      *b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
+      num_experts, topk, moe_block_size, dev,
+      at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par,
+      replicate_input, apply_weights);
   return c;
 }
 
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("marlin_gemm_moe", &marlin_gemm_moe);
+}
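The b_zeros checks encode the packing convention for AWQ zero-points: 4-bit values are packed eight to an int32 along the N dimension, so each expert's zero-point tensor is expected to be (num_groups, size_n / pack_factor). A small worked example of that arithmetic follows; the sizes are made up purely for illustration.

#include <cassert>
#include <cstdint>

int main() {
  int64_t size_k = 4096, size_n = 14336, group_size = 128;
  int64_t num_bits = 4;                      // kU4 zero-points
  int64_t pack_factor = 32 / num_bits;       // 8 nibbles per int32 -> 8
  int64_t num_groups = size_k / group_size;  // 32 groups along K
  // Expected per-expert zero-point shape: (num_groups, size_n / pack_factor)
  assert(num_groups == 32);
  assert(size_n / pack_factor == 1792);
  return 0;
}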
|||||||
@@ -1,15 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <torch/all.h>
|
|
||||||
|
|
||||||
#include "core/scalar_type.hpp"
|
|
||||||
|
|
||||||
torch::Tensor marlin_gemm_moe(
|
|
||||||
const torch::Tensor& a, const torch::Tensor& b_q_weights,
|
|
||||||
const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
|
|
||||||
const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
|
|
||||||
const torch::Tensor& g_idx, const torch::Tensor& perm,
|
|
||||||
torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
|
|
||||||
int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
|
|
||||||
int64_t num_experts, int64_t topk, int64_t moe_block_size,
|
|
||||||
bool replicate_input, bool apply_weights);
|
|
||||||
@@ -1,6 +1,5 @@
|
|||||||
#include "core/registration.h"
|
#include "core/registration.h"
|
||||||
#include "moe_ops.h"
|
#include "moe_ops.h"
|
||||||
#include "marlin_moe_ops.h"
|
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||||
// Apply topk softmax to the gating outputs.
|
// Apply topk softmax to the gating outputs.
|
||||||
@@ -13,12 +12,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
m.def(
|
m.def(
|
||||||
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
|
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
|
||||||
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
|
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
|
||||||
"g_idx, Tensor! perm, Tensor! workspace, "
|
"b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
|
||||||
"__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
|
"__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
|
||||||
"int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
|
"int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
|
||||||
"int moe_block_size, bool replicate_input, bool apply_weights)"
|
"int moe_block_size, bool replicate_input, bool apply_weights)"
|
||||||
" -> Tensor");
|
" -> Tensor");
|
||||||
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
|
// conditionally compiled so impl registration is in source file
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
99 csrc/ops.h
@@ -90,63 +90,8 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
|||||||
torch::Tensor _zeros, int64_t split_k_iters,
|
torch::Tensor _zeros, int64_t split_k_iters,
|
||||||
int64_t thx, int64_t thy);
|
int64_t thx, int64_t thy);
|
||||||
|
|
||||||
torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
|
||||||
torch::Tensor& b_scales, torch::Tensor& workspace,
|
|
||||||
int64_t size_m, int64_t size_n, int64_t size_k);
|
|
||||||
|
|
||||||
namespace machete {
|
|
||||||
|
|
||||||
std::vector<std::string> supported_schedules(
|
|
||||||
vllm::ScalarTypeTorchPtr const& btype);
|
|
||||||
|
|
||||||
torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
|
|
||||||
vllm::ScalarTypeTorchPtr const& btype,
|
|
||||||
c10::optional<torch::Tensor> const& scales,
|
|
||||||
c10::optional<torch::Tensor> const& zeros,
|
|
||||||
c10::optional<int64_t> group_size,
|
|
||||||
c10::optional<torch::Tensor> const& C,
|
|
||||||
c10::optional<double> alpha, c10::optional<double> beta,
|
|
||||||
c10::optional<std::string> schedule);
|
|
||||||
|
|
||||||
torch::Tensor prepack_B(torch::Tensor const& B,
|
|
||||||
vllm::ScalarTypeTorchPtr const& btype);
|
|
||||||
|
|
||||||
}; // namespace machete
|
|
||||||
|
|
||||||
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
|
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
|
||||||
|
|
||||||
torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
|
||||||
torch::Tensor& b_meta,
|
|
||||||
torch::Tensor& b_scales,
|
|
||||||
torch::Tensor& workspace,
|
|
||||||
vllm::ScalarTypeTorchPtr const& b_q_type,
|
|
||||||
int64_t size_m, int64_t size_n,
|
|
||||||
int64_t size_k);
|
|
||||||
|
|
||||||
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
|
||||||
torch::Tensor& b_scales, torch::Tensor& b_zeros,
|
|
||||||
torch::Tensor& g_idx, torch::Tensor& perm,
|
|
||||||
torch::Tensor& workspace,
|
|
||||||
vllm::ScalarTypeTorchPtr const& b_q_type,
|
|
||||||
int64_t size_m, int64_t size_n, int64_t size_k,
|
|
||||||
bool is_k_full, bool has_zp,
|
|
||||||
bool use_fp32_reduce);
|
|
||||||
|
|
||||||
torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|
||||||
int64_t size_k, int64_t size_n,
|
|
||||||
int64_t num_bits);
|
|
||||||
|
|
||||||
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
|
||||||
torch::Tensor& perm, c10::SymInt size_k,
|
|
||||||
c10::SymInt size_n, int64_t num_bits);
|
|
||||||
|
|
||||||
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
|
||||||
int64_t size_n, int64_t num_bits);
|
|
||||||
|
|
||||||
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
|
||||||
c10::SymInt size_k, c10::SymInt size_n,
|
|
||||||
int64_t num_bits);
|
|
||||||
|
|
||||||
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
||||||
int64_t n);
|
int64_t n);
|
||||||
|
|
||||||
@@ -156,11 +101,6 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
|
|||||||
torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
|
torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
|
||||||
int64_t row);
|
int64_t row);
|
||||||
|
|
||||||
torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
|
||||||
torch::Tensor& b_scales, torch::Tensor& workspace,
|
|
||||||
int64_t num_bits, int64_t size_m, int64_t size_n,
|
|
||||||
int64_t size_k);
|
|
||||||
|
|
||||||
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
||||||
|
|
||||||
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||||
@@ -175,14 +115,6 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
torch::Tensor const& azp_adj,
|
torch::Tensor const& azp_adj,
|
||||||
c10::optional<torch::Tensor> const& azp,
|
c10::optional<torch::Tensor> const& azp,
|
||||||
c10::optional<torch::Tensor> const& bias);
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
|
|
||||||
torch::Tensor const& b_q_weight,
|
|
||||||
torch::Tensor const& s_tok,
|
|
||||||
torch::Tensor const& s_ch,
|
|
||||||
torch::Tensor const& s_group,
|
|
||||||
torch::Tensor& workspace, int64_t size_m,
|
|
||||||
int64_t size_n, int64_t size_k);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
@@ -215,25 +147,30 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
torch::Tensor experts_ids,
|
torch::Tensor experts_ids,
|
||||||
torch::Tensor num_tokens_post_pad);
|
torch::Tensor num_tokens_post_pad);
|
||||||
|
|
||||||
std::vector<torch::Tensor> selective_scan_fwd(
|
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
|
||||||
const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
|
const torch::Tensor& A, const torch::Tensor& B,
|
||||||
const torch::Tensor& B, const torch::Tensor& C,
|
const torch::Tensor& C,
|
||||||
const c10::optional<torch::Tensor>& D_,
|
const c10::optional<torch::Tensor>& D_,
|
||||||
const c10::optional<torch::Tensor>& z_,
|
const c10::optional<torch::Tensor>& z_,
|
||||||
const c10::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
|
const c10::optional<torch::Tensor>& delta_bias_,
|
||||||
const c10::optional<torch::Tensor>& index_,
|
bool delta_softplus,
|
||||||
const c10::optional<torch::Tensor>& x);
|
const c10::optional<torch::Tensor>& query_start_loc,
|
||||||
|
const c10::optional<torch::Tensor>& cache_indices,
|
||||||
|
const c10::optional<torch::Tensor>& has_initial_state,
|
||||||
|
const torch::Tensor& ssm_states);
|
||||||
|
|
||||||
at::Tensor causal_conv1d_update(
|
at::Tensor causal_conv1d_update(
|
||||||
const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight,
|
const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight,
|
||||||
const c10::optional<at::Tensor>& bias, bool silu_activation,
|
const c10::optional<at::Tensor>& bias_, bool silu_activation,
|
||||||
const c10::optional<at::Tensor>& conv_state_indices);
|
const c10::optional<at::Tensor>& cache_seqlens_,
|
||||||
|
const c10::optional<at::Tensor>& conv_state_indices_);
|
||||||
|
|
||||||
at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
|
at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
|
||||||
const c10::optional<at::Tensor>& bias_,
|
const c10::optional<at::Tensor>& bias_,
|
||||||
const c10::optional<at::Tensor>& seq_idx_,
|
const c10::optional<at::Tensor>& conv_states,
|
||||||
const c10::optional<at::Tensor>& initial_states_,
|
const c10::optional<at::Tensor>& query_start_loc,
|
||||||
const c10::optional<at::Tensor>& final_states_out_,
|
const c10::optional<at::Tensor>& cache_indices,
|
||||||
|
const c10::optional<at::Tensor>& has_initial_state,
|
||||||
bool silu_activation);
|
bool silu_activation);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
|
|||||||
@@ -17,6 +17,17 @@ __global__ void advance_step_flashattn_kernel(
|
|||||||
long const* sampled_token_ids_ptr, long* input_positions_ptr,
|
long const* sampled_token_ids_ptr, long* input_positions_ptr,
|
||||||
int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
|
int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
|
||||||
int64_t const block_tables_stride) {
|
int64_t const block_tables_stride) {
|
||||||
|
int const n_pad = num_seqs - num_queries;
|
||||||
|
if (n_pad && blockIdx.x == 0) {
|
||||||
|
// Handle cuda graph padding
|
||||||
|
int const offset = num_queries;
|
||||||
|
for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
|
||||||
|
input_tokens_ptr[offset + i] = 0;
|
||||||
|
input_positions_ptr[offset + i] = 0;
|
||||||
|
slot_mapping_ptr[offset + i] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int num_query_blocks = div_ceil(num_queries, num_threads);
|
int num_query_blocks = div_ceil(num_queries, num_threads);
|
||||||
|
|
||||||
if (blockIdx.x >= num_query_blocks) {
|
if (blockIdx.x >= num_query_blocks) {
|
||||||
@@ -52,7 +63,7 @@ __global__ void advance_step_flashattn_kernel(
|
|||||||
slot_mapping_ptr[cur_query_id] = slot_num;
|
slot_mapping_ptr[cur_query_id] = slot_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void verify_tensor(std::string const& name, torch::Tensor& t,
|
inline void verify_tensor(std::string const& name, torch::Tensor const& t,
|
||||||
int64_t const size_0, int64_t const size_1,
|
int64_t const size_0, int64_t const size_1,
|
||||||
c10::ScalarType const type) {
|
c10::ScalarType const type) {
|
||||||
bool size_0_cond = true;
|
bool size_0_cond = true;
|
||||||
@@ -211,7 +222,7 @@ void advance_step_flashinfer(
|
|||||||
printf(" num_seqs = %d\n", num_seqs);
|
printf(" num_seqs = %d\n", num_seqs);
|
||||||
printf(" num_queries = %d\n", num_queries);
|
printf(" num_queries = %d\n", num_queries);
|
||||||
printf(" block_size = %d\n", block_size);
|
printf(" block_size = %d\n", block_size);
|
||||||
printf(" block_tables.stride(0) = %d\n", block_tables.stride(0));
|
printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0));
|
||||||
}
|
}
|
||||||
// Verify all tensors
|
// Verify all tensors
|
||||||
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
|
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
torch::Tensor const& b_scales,
|
torch::Tensor const& b_scales,
|
||||||
c10::optional<torch::Tensor> const& bias);
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
|
||||||
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
@@ -114,26 +114,39 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
|
|
||||||
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
|
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
if (version_num >= 90) {
|
// Hopper
|
||||||
// Hopper
|
|
||||||
|
|
||||||
// Guard against compilation issues for sm90 kernels
|
// Guard against compilation issues for sm90 kernels
|
||||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
|
||||||
|
if (version_num >= 90) {
|
||||||
cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
|
cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
|
||||||
#else
|
return;
|
||||||
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
|
}
|
||||||
#endif
|
#endif
|
||||||
} else if (version_num == 89) {
|
|
||||||
|
#if defined ENABLE_SCALED_MM_C2X && ENABLE_SCALED_MM_C2X
|
||||||
|
if (version_num == 89) {
|
||||||
// Ada Lovelace
|
// Ada Lovelace
|
||||||
cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales, bias);
|
cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales, bias);
|
||||||
} else if (version_num >= 80) {
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (version_num >= 80) {
|
||||||
// Ampere
|
// Ampere
|
||||||
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
|
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
|
||||||
} else {
|
return;
|
||||||
// Turing
|
|
||||||
TORCH_CHECK(version_num >= 75);
|
|
||||||
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Turing
|
||||||
|
TORCH_CHECK(version_num >= 75);
|
||||||
|
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||||
|
false,
|
||||||
|
"No compiled cutlass_scaled_mm for a compute capability less than "
|
||||||
|
"CUDA device capability: ",
|
||||||
|
version_num);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||||
@@ -174,25 +187,38 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
"currently bias dtype must match output dtype ", c.dtype());
|
"currently bias dtype must match output dtype ", c.dtype());
|
||||||
|
|
||||||
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
|
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
|
||||||
int32_t version_num = get_sm_version_num();
|
|
||||||
if (version_num >= 90) {
|
|
||||||
// Hopper
|
|
||||||
|
|
||||||
// Guard against compilation issues for sm90 kernels
|
int32_t version_num = get_sm_version_num();
|
||||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
|
||||||
|
#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
|
||||||
|
if (version_num >= 90) {
|
||||||
cutlass_scaled_mm_azp_sm90(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
cutlass_scaled_mm_azp_sm90(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
||||||
#else
|
return;
|
||||||
cutlass_scaled_mm_azp_sm80(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
}
|
||||||
#endif
|
#endif
|
||||||
} else if (version_num == 89) {
|
|
||||||
|
#if defined ENABLE_SCALED_MM_C2X && ENABLE_SCALED_MM_C2X
|
||||||
|
if (version_num == 89) {
|
||||||
// Ada Lovelace
|
// Ada Lovelace
|
||||||
cutlass_scaled_mm_azp_sm89(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
cutlass_scaled_mm_azp_sm89(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
||||||
} else if (version_num >= 80) {
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (version_num >= 80) {
|
||||||
// Ampere
|
// Ampere
|
||||||
cutlass_scaled_mm_azp_sm80(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
cutlass_scaled_mm_azp_sm80(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
||||||
} else {
|
return;
|
||||||
// Turing
|
|
||||||
TORCH_CHECK(version_num >= 75);
|
|
||||||
cutlass_scaled_mm_azp_sm75(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Turing
|
||||||
|
TORCH_CHECK(version_num >= 75);
|
||||||
|
cutlass_scaled_mm_azp_sm75(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||||
|
false,
|
||||||
|
"No compiled cutlass_scaled_mm_azp for a compute capability less than "
|
||||||
|
"CUDA device capability: ",
|
||||||
|
version_num);
|
||||||
}
|
}
|
||||||
@@ -22,6 +22,8 @@
|
|||||||
#include "../gptq_marlin/marlin.cuh"
|
#include "../gptq_marlin/marlin.cuh"
|
||||||
#include "../gptq_marlin/marlin_dtypes.cuh"
|
#include "../gptq_marlin/marlin_dtypes.cuh"
|
||||||
|
|
||||||
|
#include "core/registration.h"
|
||||||
|
|
||||||
using namespace marlin;
|
using namespace marlin;
|
||||||
|
|
||||||
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
||||||
@@ -1303,3 +1305,7 @@ torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||||
|
m.impl("fp8_marlin_gemm", &fp8_marlin_gemm);
|
||||||
|
}
|
||||||
@@ -1,25 +1,6 @@
|
|||||||
#include "marlin.cuh"
|
#include "marlin.cuh"
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
#include "core/registration.h"
|
||||||
|
|
||||||
namespace marlin {
|
|
||||||
|
|
||||||
template <int const num_threads, int const num_bits, bool const has_perm>
|
|
||||||
__global__ void awq_marlin_repack_kernel(
|
|
||||||
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr,
|
|
||||||
int size_k, int size_n) {}
|
|
||||||
|
|
||||||
} // namespace marlin
|
|
||||||
|
|
||||||
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|
||||||
int64_t size_k, int64_t size_n,
|
|
||||||
int64_t num_bits) {
|
|
||||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
|
||||||
false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
|
|
||||||
return torch::empty({1, 1});
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
namespace marlin {
|
namespace marlin {
|
||||||
|
|
||||||
@@ -122,7 +103,7 @@ __global__ void awq_marlin_repack_kernel(
 }
 
 uint32_t vals[8];
 #pragma unroll
 for (int i = 0; i < 4; i++) {
 int cur_elem = tc_row + tc_offsets[i];
 
@@ -143,7 +124,7 @@ __global__ void awq_marlin_repack_kernel(
 constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
 
 uint32_t res = 0;
 #pragma unroll
 for (int i = 0; i < 8; i++) {
 res |= vals[pack_idx[i]] << (i * 4);
 }
@@ -155,7 +136,7 @@ __global__ void awq_marlin_repack_kernel(
 
 uint32_t res1 = 0;
 uint32_t res2 = 0;
 #pragma unroll
 for (int i = 0; i < 4; i++) {
 res1 |= vals[pack_idx[i]] << (i * 8);
 res2 |= vals[4 + pack_idx[i]] << (i * 8);
@@ -167,21 +148,21 @@ __global__ void awq_marlin_repack_kernel(
 };
 
 auto start_pipes = [&](int k_tile_id, int n_tile_id) {
 #pragma unroll
 for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
 fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
 }
 
 wait_for_stage();
 };
 #pragma unroll
 for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
 int n_tile_id = 0;
 
 start_pipes(k_tile_id, n_tile_id);
 
 while (n_tile_id < n_tiles) {
 #pragma unroll
 for (int pipe = 0; pipe < repack_stages; pipe++) {
 fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
 n_tile_id + pipe + repack_stages - 1);
@@ -195,15 +176,15 @@ __global__ void awq_marlin_repack_kernel(
 
 } // namespace marlin
 
 #define CALL_IF(NUM_BITS) \
 else if (num_bits == NUM_BITS) { \
 cudaFuncSetAttribute( \
 marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS>, \
 cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
 marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS> \
 <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
 b_q_weight_ptr, out_ptr, size_k, size_n); \
 }
 
 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
 int64_t size_n, int64_t num_bits) {
@@ -266,8 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
 return out;
 }
 
-#endif
-
 torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
 c10::SymInt size_k, c10::SymInt size_n,
 int64_t num_bits) {
@@ -279,3 +258,11 @@ torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
 {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
 options);
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("awq_marlin_repack", &awq_marlin_repack);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
+m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
+}
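With the `TORCH_LIBRARY_IMPL_EXPAND` blocks above, the CUDA and Meta implementations now register themselves from the same source file that defines them, so the schema declared in `torch_bindings.cpp` no longer needs a matching `ops.impl(...)` call there. A minimal sketch of how such a registered op is typically reached from Python; the `_C` namespace and the helper name below are illustrative assumptions, not code from this change:

```python
# Illustrative sketch only: assumes the extension is built so that
# TORCH_EXTENSION_NAME resolves to vLLM's `_C` op namespace.
import torch

def repack_awq_weight(qweight: torch.Tensor, size_k: int, size_n: int,
                      num_bits: int) -> torch.Tensor:
    # Dispatches to the CUDA impl for GPU tensors; the Meta impl registered
    # above supplies the output shape when tracing with meta/fake tensors.
    return torch.ops._C.awq_marlin_repack(qweight, size_k, size_n, num_bits)
```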
@@ -23,6 +23,8 @@
 #include "marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"
 
+#include "core/registration.h"
+
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
 static_assert(std::is_same<scalar_t, half>::value || \
 std::is_same<scalar_t, nv_bfloat16>::value, \
@@ -2258,7 +2260,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
 "b_zeros dim 0 = ", b_zeros.size(0),
 " is not num_groups = ", num_groups);
 TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor,
-"b_zeros dim 1 = ", b_scales.size(1),
+"b_zeros dim 1 = ", b_zeros.size(1),
 " is not size_n / pack_factor = ", size_n / pack_factor);
 }
 
@@ -2297,3 +2299,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
 }
 
 #endif
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("gptq_marlin_gemm", &gptq_marlin_gemm);
+}
@@ -1,26 +1,6 @@
 #include "marlin.cuh"
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-
-namespace marlin {
-
-template <int const num_threads, int const num_bits, bool const has_perm>
-__global__ void gptq_marlin_repack_kernel(
-uint32_t const* __restrict__ b_q_weight_ptr,
-uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
-int size_k, int size_n) {}
-
-} // namespace marlin
-
-torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
-int64_t size_k, int64_t size_n,
-int64_t num_bits) {
-TORCH_CHECK_NOT_IMPLEMENTED(
-false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
-return torch::empty({1, 1});
-}
-
-#else
+#include "core/registration.h"
 
 namespace marlin {
 
@@ -174,13 +154,13 @@ __global__ void gptq_marlin_repack_kernel(
 uint32_t b1_vals[tile_ints];
 uint32_t b2_vals[tile_ints];
 
 #pragma unroll
 for (int i = 0; i < tile_ints; i++) {
 b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
 b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
 }
 
 #pragma unroll
 for (int i = 0; i < 4; i++) {
 int cur_elem = tc_row + tc_offsets[i];
 int cur_int = cur_elem / pack_factor;
@@ -200,7 +180,7 @@ __global__ void gptq_marlin_repack_kernel(
 constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
 
 uint32_t res = 0;
 #pragma unroll
 for (int i = 0; i < 8; i++) {
 res |= vals[pack_idx[i]] << (i * 4);
 }
@@ -212,7 +192,7 @@ __global__ void gptq_marlin_repack_kernel(
 
 uint32_t res1 = 0;
 uint32_t res2 = 0;
 #pragma unroll
 for (int i = 0; i < 4; i++) {
 res1 |= vals[pack_idx[i]] << (i * 8);
 res2 |= vals[4 + pack_idx[i]] << (i * 8);
@@ -224,14 +204,14 @@ __global__ void gptq_marlin_repack_kernel(
 };
 
 auto start_pipes = [&](int k_tile_id, int n_tile_id) {
 #pragma unroll
 for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
 fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
 }
 
 wait_for_stage();
 };
 #pragma unroll
 for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
 int n_tile_id = 0;
 
@@ -242,7 +222,7 @@ __global__ void gptq_marlin_repack_kernel(
 start_pipes(k_tile_id, n_tile_id);
 
 while (n_tile_id < n_tiles) {
 #pragma unroll
 for (int pipe = 0; pipe < repack_stages; pipe++) {
 fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
 n_tile_id + pipe + repack_stages - 1);
@@ -256,17 +236,17 @@ __global__ void gptq_marlin_repack_kernel(
 
 } // namespace marlin
 
 #define CALL_IF(NUM_BITS, HAS_PERM) \
 else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \
 cudaFuncSetAttribute( \
 marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
 HAS_PERM>, \
 cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
 marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
 HAS_PERM> \
 <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
 b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
 }
 
 torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 int64_t size_k, int64_t size_n,
@@ -341,8 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 return out;
 }
 
-#endif
-
 torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
 torch::Tensor& perm, c10::SymInt size_k,
 c10::SymInt size_n, int64_t num_bits) {
@@ -354,3 +332,11 @@ torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
 {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
 options);
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("gptq_marlin_repack", &gptq_marlin_repack);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
+m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
+}
@@ -284,7 +284,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE)
 prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
 
 
-def create_sources(impl_config: ImplConfig, num_impl_files=2):
+def create_sources(impl_config: ImplConfig, num_impl_files=1):
     sources = []
 
     type_name = generate_type_signature(impl_config.type_config)
@@ -457,7 +457,13 @@ def generate():
        )),
    ]
 
-    schedules = list(set([x[1] for x in default_heuristic]))
+    # Do not use schedules = list(set(...)) because we need to make sure
+    # the output list is deterministic; otherwise the generated kernel file
+    # will be non-deterministic and causes ccache miss.
+    schedules = []
+    for _, schedule_config in default_heuristic:
+        if schedule_config not in schedules:
+            schedules.append(schedule_config)
 
    impl_configs = []
 
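The replacement loop above de-duplicates the heuristic's schedule configs while preserving first-seen order, which keeps the generated kernel file byte-identical across runs. A small standalone sketch of the same order-preserving de-duplication (names here are illustrative, not taken from the generator):

```python
def unique_in_order(items):
    """De-duplicate while preserving first-seen order.

    Unlike list(set(items)), the result does not depend on hash/iteration
    order, and it also works for unhashable items such as config objects.
    """
    seen = []
    for item in items:
        if item not in seen:
            seen.append(item)
    return seen

assert unique_in_order([3, 1, 3, 2, 1]) == [3, 1, 2]
```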
@@ -591,24 +591,27 @@ struct MacheteCollectiveMma {
 tma_load_b = make_tma_copy_B(
 make_logical_tensor(ptr_B, make_shape(N, K, L), args.dB));
 
+int32_t scale_k =
+(ModeHasScales) ? (K + args.group_size - 1) / args.group_size : 0;
+int32_t group_size = (ModeHasScales) ? args.group_size : 0;
+
 if constexpr (ModeHasScales) {
-tma_load_scale = make_tma_copy_scale(make_logical_tensor(
-args.ptr_S, make_shape(M, args.group_size, L), args.dS));
+tma_load_scale = make_tma_copy_scale(
+make_logical_tensor(args.ptr_S, make_shape(M, scale_k, L), args.dS));
 }
 
 if constexpr (KernelConversionMode ==
 ConversionMode::ConvertAndScaleWithZero) {
-tma_load_zero = make_tma_copy_zero(make_logical_tensor(
-args.ptr_Z, make_shape(M, args.group_size, L), args.dS));
+tma_load_zero = make_tma_copy_zero(
+make_logical_tensor(args.ptr_Z, make_shape(M, scale_k, L), args.dS));
 }
 
-if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
-return {tma_load_a, tma_load_b, tma_load_scale, tma_load_zero, 0, 0};
-} else if constexpr (ModeHasScales) {
-auto scale_k = (K + args.group_size - 1) / args.group_size;
+if constexpr (KernelConversionMode == ConversionMode::DirectConvert ||
+KernelConversionMode == ConversionMode::ConvertAndScale ||
+KernelConversionMode ==
+ConversionMode::ConvertAndScaleWithZero) {
 
 return {tma_load_a, tma_load_b, tma_load_scale,
-tma_load_zero, scale_k, args.group_size};
+tma_load_zero, scale_k, group_size};
 } else {
 static_assert(cutlass::detail::dependent_false<KernelSchedule>,
 "Conversion mode not handled in to_underlying_arguments.");
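The new `scale_k` is the number of quantization groups along K (a ceil division by `group_size`), and the scale and zero-point TMA descriptors above are now shaped with `scale_k` rather than `group_size`. A quick worked example of that ceil division, with purely illustrative numbers:

```python
# Ceil division used for scale_k: one scale (and zero point) per group of
# `group_size` elements along K. The values below are only an example.
K, group_size = 4096, 128
scale_k = (K + group_size - 1) // group_size
assert scale_k == 32
```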
@@ -34,10 +34,9 @@ static __global__ void prepack_B_kernel(BInTensor B_in,
 }
 
 template <typename PrepackedLayoutB, typename InLayout>
-static void prepack_B(cudaStream_t stream,
-typename PrepackedLayoutB::ElementB const* B_in_ptr,
-InLayout B_layout,
-typename PrepackedLayoutB::ElementB* B_out_ptr) {
+static void prepack_B_template(
+cudaStream_t stream, typename PrepackedLayoutB::ElementB const* B_in_ptr,
+InLayout B_layout, typename PrepackedLayoutB::ElementB* B_out_ptr) {
 using TileShapeNKL =
 decltype(append(typename PrepackedLayoutB::PPBlockShape_NK{}, _1{}));
 auto ilvd_NKbNbKL_to_offset =
@@ -55,8 +55,8 @@ torch::Tensor prepack_impl(torch::Tensor const B) {
 // Allocate output
 torch::Tensor D = torch::empty_like(B, {}, at::MemoryFormat::Contiguous);
 
-prepack_B<PrepackedLayoutB>(stream, B_ptr, layout_Bt,
-static_cast<ElementB*>(D.mutable_data_ptr()));
+prepack_B_template<PrepackedLayoutB>(
+stream, B_ptr, layout_Bt, static_cast<ElementB*>(D.mutable_data_ptr()));
 
 return D;
 };
@@ -2,6 +2,8 @@
 #include "machete_prepack_launcher.cuh"
 #include "core/scalar_type.hpp"
 
+#include "core/registration.h"
+
 namespace machete {
 
 using namespace vllm;
@@ -78,14 +80,20 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
 }
 
 torch::Tensor prepack_B(torch::Tensor const& B,
-ScalarTypeTorchPtr const& btype) {
-#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12
+vllm::ScalarTypeTorchPtr const& btype) {
 return scalar_type_dispatch(*btype, [&](auto BType) {
 return PrepackBDispatcher<half_t, decltype(BType), half_t>::dispatch(B);
 });
-#else
-TORCH_CHECK(false, "Machete requires CUDA 12.0 or later");
-#endif
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("machete_prepack_B", &prepack_B);
+m.impl("machete_gemm", &gemm);
+}
+
+// use CatchAll since supported_schedules has no tensor arguments
+TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) {
+m.impl("machete_supported_schedules", &supported_schedules);
 }
 
 }; // namespace machete
@@ -26,6 +26,7 @@
 #include <iostream>
 
 #include "common/base.h"
+#include "core/registration.h"
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
 #include "common/mem.h"
@@ -1066,3 +1067,7 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
 
 return c;
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("marlin_gemm", &marlin_gemm);
+}
@@ -30,6 +30,7 @@
 #include <iostream>
 
 #include "../dense/common/base.h"
+#include "core/registration.h"
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
 #include "../dense/common/mem.h"
@@ -1241,3 +1242,7 @@ torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
 
 return d;
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("marlin_qqq_gemm", &marlin_qqq_gemm);
+}
@@ -28,6 +28,7 @@
 
 #include "common/base.h"
 #include "core/scalar_type.hpp"
+#include "core/registration.h"
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
 
@@ -1134,3 +1135,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
 
 return c;
 }
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+m.impl("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
+}
@@ -167,7 +167,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 ops.def(
 "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
 "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
-ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
+// conditionally compiled so impl in source file
 
 // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
 ops.def(
@@ -175,22 +175,24 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 "Tensor b_scales, Tensor workspace, "
 "__torch__.torch.classes._core_C.ScalarType b_q_type, "
 "int size_m, int size_n, int size_k) -> Tensor");
-ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
+// conditionally compiled so impl in source file
 
 // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
-ops.def("machete_supported_schedules", &machete::supported_schedules);
+ops.def(
+"machete_supported_schedules("
+" __torch__.torch.classes._core_C.ScalarType btype"
+") -> str[]");
 ops.def(
 "machete_gemm(Tensor A, Tensor B,"
 " __torch__.torch.classes._core_C.ScalarType btype,"
 " Tensor? scales, Tensor? zeros, int? group_size,"
 " Tensor? C, float? alpha, float? beta, str? schedule)"
 "-> Tensor");
-ops.impl("machete_gemm", torch::kCUDA, &machete::gemm);
 ops.def(
 "machete_prepack_B(Tensor B,"
 " __torch__.torch.classes._core_C.ScalarType btype)"
 "-> Tensor");
-ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
+// conditionally compiled so impl registration is in source file
 
 ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
 ops.impl("permute_cols", torch::kCUDA, &permute_cols);
@@ -202,21 +204,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 "__torch__.torch.classes._core_C.ScalarType b_q_type, "
 "int size_m, int size_n, int size_k, bool is_k_full, "
 "bool has_zp, bool use_fp32_reduce) -> Tensor");
-ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
+// conditionally compiled so impl registration is in source file
 
 // gptq_marlin repack from GPTQ.
 ops.def(
 "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
 "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
-ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
-ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
+// conditionally compiled so impl registrations are in source file
 
 // awq_marlin repack from AWQ.
 ops.def(
 "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
 "SymInt size_n, int num_bits) -> Tensor");
-ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
-ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
+// conditionally compiled so impl registrations are in source file
 
 // Dequantization for GGML.
 ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
@@ -237,7 +237,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
 "Tensor! workspace, int num_bits, int size_m, int size_n, "
 "int size_k) -> Tensor");
-ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
+// conditionally compiled so impl registration is in source file
 
 // marlin_qqq_gemm for QQQ.
 ops.def(
@@ -245,7 +245,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 "Tensor s_tok, Tensor s_ch, Tensor s_group, "
 "Tensor! workspace, int size_m, int size_n, "
 "int size_k) -> Tensor");
-ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
+// conditionally compiled so impl registration is in source file
 
 // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
 // quantization, as well as bias
@@ -273,26 +273,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 ops.def(
 "selective_scan_fwd(Tensor! u, Tensor! delta,"
 "Tensor! A, Tensor! B, Tensor! C,"
-"Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
+"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
 "bool delta_softplus,"
-"Tensor? index_, Tensor!? x) -> Tensor[]");
+"Tensor? query_start_loc,"
+"Tensor? cache_indices,"
+"Tensor? has_initial_state,"
+"Tensor! ssm_states) -> ()");
 ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
 
 ops.def(
 "causal_conv1d_update(Tensor! x,"
 "Tensor! conv_state,"
 "Tensor! weight,"
-"Tensor? bias,"
+"Tensor? bias_,"
 "bool silu_activation,"
+"Tensor? cache_seqlens_,"
 "Tensor? conv_state_indices) -> Tensor");
 ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
 
 ops.def(
 "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
 "Tensor? bias_,"
-"Tensor? seq_idx_,"
-"Tensor? initial_states_,"
-"Tensor!? final_states_out_,"
+"Tensor!? conv_states,"
+"Tensor? query_start_loc,"
+"Tensor? cache_indices,"
+"Tensor? has_initial_state,"
 "bool silu_activation) -> Tensor");
 ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
 #endif
@@ -4,6 +4,7 @@ sphinx-copybutton==0.5.2
 myst-parser==2.0.0
 sphinx-argparse==0.4.0
 msgspec
+cloudpickle
 
 # packages to install to build the documentation
 pydantic >= 2.8
@@ -13,3 +14,4 @@ py-cpuinfo
 transformers
 mistral_common >= 1.3.4
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
+partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
@@ -8,7 +8,7 @@ Multi-Modality
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 
 Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
-via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`.
+via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
 
 Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
 by following :ref:`this guide <adding_multimodal_plugin>`.
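The renamed ``PromptType`` covers the same dictionary-style prompts as before; a hedged sketch of passing image data through ``multi_modal_data`` (the model name and prompt template below are illustrative only, not taken from this change):

```python
from PIL import Image
from vllm import LLM

# Any supported vision-language model and its own chat format would be used
# in practice; this model name is just an example.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

image = Image.new("RGB", (336, 336))  # placeholder image for the sketch
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```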
@@ -1,7 +1,7 @@
 LLM Inputs
 ==========
 
-.. autodata:: vllm.inputs.PromptInputs
+.. autodata:: vllm.inputs.PromptType
 
 .. autoclass:: vllm.inputs.TextPrompt
     :show-inheritance:
@@ -1,32 +1,53 @@
 .. _debugging:
 
+===============
 Debugging Tips
 ===============
 
-Debugging hang/crash issues
----------------------------
+This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues <https://github.com/vllm-project/vllm/issues?q=is%3Aissue>`_ first to see if it has already been reported. If not, please `file a new issue <https://github.com/vllm-project/vllm/issues/new/choose>`_, providing as much relevant information as possible.
 
-When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time:
+.. note::
 
-- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
-- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
-- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+    Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
 
-If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
+Hangs downloading a model
+----------------------------------------
+If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection.
+It's recommended to download the model first using the `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and passing the local path to the model to vLLM. This way, you can isolate the issue.
 
-- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
-- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble.
-- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
-- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.
+Hangs loading a model from disk
+----------------------------------------
+If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
+It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
 
-With more logging, hopefully you can find the root cause of the issue.
+Model is too large
+----------------------------------------
+If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism <https://docs.vllm.ai/en/latest/serving/distributed_serving.html#distributed-inference-and-serving>`_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
-If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
+Enable more logging
+----------------------------------------
+If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
 
-Here are some common issues that can cause hangs:
+- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
+- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem.
+- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
+- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs.
 
-- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address.
-- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly.
+Incorrect network setup
+----------------------------------------
+The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one.
+If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=<your_ip_address>``.
+
+You might also need to set ``export NCCL_SOCKET_IFNAME=<your_network_interface>`` and ``export GLOO_SOCKET_IFNAME=<your_network_interface>`` to specify the network interface for the IP address.
+
+Error near ``self.graph.replay()``
+----------------------------------------
+If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph.
+To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
+
+Incorrect hardware/driver
+----------------------------------------
+If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
 
 .. code-block:: python
 
@@ -84,33 +105,29 @@ Here are some common issues that can cause hangs:
     dist.destroy_process_group(gloo_group)
     dist.destroy_process_group()
 
-.. tip::
-
-    Save the script as ``test.py``.
-
-    If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use.
-
-    If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``:
-
-    - is the correct IP address of the master node
-    - is reachable from all nodes
-    - is set before running the script.
-
-    If the script runs successfully, you should see the message ``sanity check is successful!``.
-
-    Note that multi-node environment is more complicated than single-node. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
+If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use:
+
+.. code-block:: shell
+
+    NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
+
+If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run:
+
+.. code-block:: shell
+
+    NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+
+If the script runs successfully, you should see the message ``sanity check is successful!``.
+
+.. note::
+
+    A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
 
     - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``.
     - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``.
 
-    Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup. The difference is that you need to execute different commands (with different ``--node-rank``) on different nodes.
-
-If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
-
-Some known issues:
-
-- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq <https://github.com/zeromq/pyzmq/issues/2000>`_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix <https://github.com/vllm-project/vllm/pull/6759>`_ .
-
-.. warning::
-
-    After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on.
+    Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes.
+
+Known Issues
+----------------------------------------
+- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq <https://github.com/zeromq/pyzmq/issues/2000>`_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix <https://github.com/vllm-project/vllm/pull/6759>`_.
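For reference, the GPU/CPU communication sanity check described in the debugging page above looks roughly like the sketch below; it is a minimal sketch built around the ``dist.destroy_process_group`` calls visible in the hunk, and the exact script in the vLLM docs may differ.

```python
# test.py -- minimal sketch of a GPU/CPU communication sanity check.
# Run with: NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)

# GPU collective over NCCL.
data = torch.ones(1, device="cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
assert data.item() == dist.get_world_size()

# CPU collective over a Gloo group.
gloo_group = dist.new_group(backend="gloo")
cpu_data = torch.ones(1)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
assert cpu_data.item() == dist.get_world_size()

if dist.get_rank() == 0:
    print("sanity check is successful!")

dist.destroy_process_group(gloo_group)
dist.destroy_process_group()
```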
@@ -1,19 +1,20 @@
 .. _installation:
 
+============
 Installation
 ============
 
 vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
 
 Requirements
-------------
+===========================
 
 * OS: Linux
 * Python: 3.8 -- 3.12
 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
-Install with pip
-----------------
+Install released versions
+===========================
 
 You can install vLLM using pip:
 
@@ -46,98 +47,173 @@ You can install vLLM using pip:
 
 Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
 
-.. note::
-
-    vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command:
-
-    .. code-block:: console
-
-        $ export VLLM_VERSION=0.6.1.post1 # vLLM's main branch version is currently set to latest released tag
-        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
-        $ # You can also access a specific commit
-        $ # export VLLM_COMMIT=...
-        $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
+.. _install-the-latest-code:
+
+Install the latest code
+=========================
+
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on x86 platform with cuda 12 for every commit since v0.5.3. You can download and install the latest one with the following command:
+
+.. code-block:: console
+
+    $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+
+If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
+
+.. code-block:: console
+
+    $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+    $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+
+Note that the wheels are built with Python 3.8 abi (see `PEP 425 <https://peps.python.org/pep-0425/>`_ for more details about abi), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata.
+
+Another way to access the latest code is to use the docker images:
+
+.. code-block:: console
+
+    $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+    $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}
+
+These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
+
+Latest code can contain bugs and may not be stable. Please use it with caution.
 
 .. _build_from_source:
 
 Build from source
------------------
+==================
 
-You can also build and install vLLM from source:
+.. _python-only-build:
+
+Python-only build (without compilation)
+----------------------------------------
+
+If you only need to change Python code, you can simply build vLLM without compilation.
+
+The first step is to install the latest vLLM wheel:
+
+.. code-block:: console
+
+    pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+
+You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
+
+After verifying that the installation is successful, you can use `the following script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_:
 
 .. code-block:: console
 
     $ git clone https://github.com/vllm-project/vllm.git
    $ cd vllm
-    $ pip install -e . # This may take 5-10 minutes.
+    $ python python_only_dev.py
+
+The script will:
+
+* Find the installed vLLM package in the current environment.
+* Copy built files to the current directory.
+* Rename the installed vLLM package.
+* Symbolically link the current directory to the installed vLLM package.
+
+Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM.
+
+Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_ with the ``--quit-dev``(or ``-q`` for short) flag:
+
+.. code-block:: console
+
+    $ python python_only_dev.py --quit-dev
+
+The script with ``--quit-dev`` flag will:
+
+* Remove the symbolic link from the current directory to the vLLM package.
+* Restore the original vLLM package from the backup.
+
+If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to start `all above <#python-only-build>`_ over again.
 
.. note::
|
.. note::
|
||||||
|
|
||||||
This will uninstall existing PyTorch, and install the version required by vLLM. If you want to use an existing PyTorch installation, there need to be some changes:
|
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
|
||||||
|
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel.
|
||||||
|
|
||||||
.. code-block:: console
|
Full build (with compilation)
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
$ git clone https://github.com/vllm-project/vllm.git
|
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
|
||||||
$ cd vllm
|
|
||||||
$ python use_existing_torch.py
|
|
||||||
$ pip install -r requirements-build.txt
|
|
||||||
$ pip install -e . --no-build-isolation
|
|
||||||
|
|
||||||
The differences are:
|
.. code-block:: console
|
||||||
|
|
||||||
- ``python use_existing_torch.py``: This script will remove all the PyTorch versions in the requirements files, so that the existing PyTorch installation will be used.
|
|
||||||
- ``pip install -r requirements-build.txt``: You need to manually install the requirements for building vLLM.
|
|
||||||
- ``pip install -e . --no-build-isolation``: You need to disable build isolation, so that the build system can use the existing PyTorch installation.
|
|
||||||
|
|
||||||
This is especially useful when the PyTorch dependency cannot be easily installed via pip, e.g.:
|
|
||||||
|
|
||||||
- build vLLM with PyTorch nightly or a custom PyTorch build.
|
|
||||||
- build vLLM with aarch64 and cuda (GH200), where the PyTorch wheels are not available on PyPI. Currently, only PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to install PyTorch nightly, and then build vLLM on top of it.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
vLLM can fully run only on Linux, but you can still build it on other systems (for example, macOS). This build is only for development purposes, allowing for imports and a more convenient dev environment. The binaries will not be compiled and not work on non-Linux systems. You can create such a build with the following commands:
|
|
||||||
|
|
||||||
.. code-block:: console
|
|
||||||
|
|
||||||
$ export VLLM_TARGET_DEVICE=empty
|
|
||||||
$ pip install -e .
|
|
||||||
|
|
||||||
|
$ git clone https://github.com/vllm-project/vllm.git
|
||||||
|
$ cd vllm
|
||||||
|
$ pip install -e .
|
||||||
|
|
||||||
.. tip::
|
.. tip::
|
||||||
|
|
||||||
Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
|
Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
|
||||||
|
For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
|
||||||
|
As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
|
||||||
|
|
Use an existing PyTorch installation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:

* Building vLLM with PyTorch nightly or a custom PyTorch build.
* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly <https://pytorch.org/get-started/locally/>`_, and then build vLLM on top of it.

To build vLLM using an existing PyTorch installation:

.. code-block:: console

    $ git clone https://github.com/vllm-project/vllm.git
    $ cd vllm
    $ python use_existing_torch.py
    $ pip install -r requirements-build.txt
    $ pip install -e . --no-build-isolation
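Before building, you may want to confirm which PyTorch build will be picked up (a hedged sanity check, not part of the official steps):

.. code-block:: console

    $ python -c "import torch; print(torch.__version__, torch.version.cuda)"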
Troubleshooting
~~~~~~~~~~~~~~~~~

To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously via the environment variable ``MAX_JOBS``. For example:

.. code-block:: console

    $ export MAX_JOBS=6
    $ pip install -e .

This is especially useful when you are building on less powerful machines. For example, when you use WSL, it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
A side effect is a much slower build process.

Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.

.. code-block:: console

    $ # Use `--ipc=host` to make sure the shared memory is large enough.
    $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3

If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:

.. code-block:: console

    $ export CUDA_HOME=/usr/local/cuda
    $ export PATH="${CUDA_HOME}/bin:$PATH"

Here is a sanity check to verify that the CUDA Toolkit is correctly installed:

.. code-block:: console

    $ nvcc --version # verify that nvcc is in your PATH
    $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
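A common cause of build failures is a mismatch between the CUDA Toolkit and the CUDA version your PyTorch build was compiled against; a quick hedged check (assuming PyTorch is installed) is:

.. code-block:: console

    $ nvcc --version | grep release
    $ python -c "import torch; print(torch.version.cuda)"  # should match the nvcc release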
Unsupported OS build
----------------------

vLLM can fully run only on Linux, but for development purposes you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.

Simply set the ``VLLM_TARGET_DEVICE`` environment variable to ``empty`` before installing:

.. code-block:: console

    $ export VLLM_TARGET_DEVICE=empty
    $ pip install -e .
@@ -27,6 +27,10 @@ Installation steps:

.. _build_from_source_neuron:

.. note::

    The currently supported version of PyTorch for Neuron installs ``triton`` version ``2.1.0``. This is incompatible with vLLM >= 0.5.3. You may see an error ``cannot import name 'default_dump_dir...``. To work around this, run a ``pip install --upgrade triton==3.0.0`` after installing the vLLM wheel.
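    For example, after installing the vLLM wheel:

    .. code-block:: console

        $ pip install --upgrade triton==3.0.0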
Build from source
-----------------
@@ -3,7 +3,7 @@

Installation with OpenVINO
==========================

vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs <https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu>`_). OpenVINO vLLM backend supports the following advanced vLLM features:

- Prefix caching (``--enable-prefix-caching``)
- Chunked prefill (``--enable-chunked-prefill``)

@@ -59,28 +59,51 @@ Install from source

    $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .

- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html <https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html>`_.
.. _openvino_backend_performance_tips:

Performance tips
----------------

vLLM OpenVINO backend environment variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- ``VLLM_OPENVINO_DEVICE`` to specify which device to use for inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g., ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, the CPU device is used by default.

- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during the model loading stage. By default, compression is turned off. You can also export the model with different compression techniques using ``optimum-cli`` and pass the exported folder as ``<model_id>``.
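For example (a hedged sketch reusing the benchmark command shown below; the model and dataset are placeholders), to run on the second GPU in the system with weight compression enabled:

.. code-block:: console

    $ VLLM_OPENVINO_DEVICE=GPU.1 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json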
CPU performance tips
~~~~~~~~~~~~~~~~~~~~

CPU uses the following environment variables to control behavior:

- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV cache size (e.g., ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB of space for the KV cache). A larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.

- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``).

OpenVINO best known configuration for CPU is:

.. code-block:: console

    $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
GPU performance tips
~~~~~~~~~~~~~~~~~~~~

GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account the ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using the ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g., ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB of space for the KV cache).

Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and ``preemption-mode=swap``.

OpenVINO best known configuration for GPU is:

.. code-block:: console

    $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
.. _openvino_backend_limitations:

Limitations
@@ -79,12 +79,14 @@ Documentation

   serving/openai_compatible_server
   serving/deploying_with_docker
   serving/deploying_with_k8s
   serving/distributed_serving
   serving/metrics
   serving/env_vars
   serving/usage_stats
   serving/integrations
   serving/tensorizer
   serving/compatibility_matrix
   serving/faq

.. toctree::
@@ -85,21 +85,21 @@ When it comes to the linear layers, we provide the following options to parallel

* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.

Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
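To make the division of labor concrete, here is a hedged sketch of a tensor-parallel gated MLP block assembled from the layers above. It is modeled on existing vLLM models rather than copied from them, and constructor arguments can differ between vLLM versions; it also assumes the distributed state has already been initialized.

.. code-block:: python

    import torch
    from torch import nn

    from vllm.model_executor.layers.activation import SiluAndMul
    from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                   RowParallelLinear)


    class ExampleMLP(nn.Module):
        """Illustrative gated MLP built from vLLM's parallel linear layers."""

        def __init__(self, hidden_size: int, intermediate_size: int):
            super().__init__()
            # Gate and up projections merged into a single column-parallel layer.
            self.gate_up_proj = MergedColumnParallelLinear(
                hidden_size, [intermediate_size] * 2, bias=False)
            # Row-parallel projection back to the hidden size; performs the
            # all-reduce described above.
            self.down_proj = RowParallelLinear(
                intermediate_size, hidden_size, bias=False)
            self.act_fn = SiluAndMul()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # vLLM linear layers return a (output, output_bias) tuple.
            gate_up, _ = self.gate_up_proj(x)
            x = self.act_fn(gate_up)
            x, _ = self.down_proj(x)
            return x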
4. Implement the weight loading logic
-------------------------------------

You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class.
This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
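The exact mapping depends on your architecture; below is a hedged sketch modeled on existing vLLM models (a Llama-style layout), assuming your model merges the QKV and gate/up projections as described above. The attribute names ``qkv_proj`` and ``gate_up_proj`` are illustrative and must match your own model definition.

.. code-block:: python

    from typing import Iterable, Tuple

    import torch
    from torch import nn

    from vllm.model_executor.model_loader.weight_utils import default_weight_loader


    class YourModelForCausalLM(nn.Module):
        # ... layers defined as in the previous steps ...

        def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
            # Checkpoint weights that map onto merged vLLM layers:
            # (vllm_param_name, checkpoint_shard_name, shard_id)
            stacked_params_mapping = [
                ("qkv_proj", "q_proj", "q"),
                ("qkv_proj", "k_proj", "k"),
                ("qkv_proj", "v_proj", "v"),
                ("gate_up_proj", "gate_proj", 0),
                ("gate_up_proj", "up_proj", 1),
            ]
            params_dict = dict(self.named_parameters())
            for name, loaded_weight in weights:
                for param_name, shard_name, shard_id in stacked_params_mapping:
                    if shard_name not in name:
                        continue
                    name = name.replace(shard_name, param_name)
                    param = params_dict[name]
                    # Merged layers expose a weight_loader that places each shard.
                    param.weight_loader(param, loaded_weight, shard_id)
                    break
                else:
                    # All other weights are loaded one-to-one.
                    param = params_dict[name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)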
5. Register your model
----------------------

Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/registry.py>`_.
6. Out-of-Tree Model Integration
--------------------------------------------
@@ -114,6 +114,18 @@ Just add the following lines in your code:

    from your_code import YourModelForCausalLM
    ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

If your model imports modules that initialize CUDA, consider lazy-importing it instead to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:

.. code-block:: python

    from vllm import ModelRegistry

    ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")

.. important::

    If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
    Read more about that :ref:`here <enabling_multimodal_inputs>`.

If you are running the api server with :code:`vllm serve <args>`, you can wrap the entrypoint with the following code:
.. code-block:: python
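    # The wrapper body is not shown in this hunk; the sketch below is a hedged
    # illustration (not the exact snippet from the vLLM docs) of how an
    # out-of-tree model can be registered before launching the
    # OpenAI-compatible API server via runpy, e.g. `python your_wrapper.py <args>`.
    from vllm import ModelRegistry
    from your_code import YourModelForCausalLM

    ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

    if __name__ == "__main__":
        import runpy
        runpy.run_module("vllm.entrypoints.openai.api_server", run_name="__main__")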
@@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo

You can also monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting ``disable_log_stats=False``.

.. _chunked-prefill:

Chunked Prefill
---------------
vLLM supports an experimental feature, chunked prefill. Chunked prefill allows large prefills to be chunked into smaller pieces and batched together with decode requests.
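For example, chunked prefill can be enabled when constructing the engine (a hedged sketch; the model name and token budget below are only placeholders):

.. code-block:: python

    from vllm import LLM

    # Enable chunked prefill and cap the per-step token budget (placeholder values).
    llm = LLM(
        model="meta-llama/Llama-2-7b-chat-hf",
        enable_chunked_prefill=True,
        max_num_batched_tokens=256,
    )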