Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
72506c9834 | ||
|
|
b2eb84de77 | ||
|
|
ac43367ced | ||
|
|
30fe765e9f | ||
|
|
2c0ee0fde8 | ||
|
|
55f1fc1b1b | ||
|
|
17f3988094 | ||
|
|
682c38583c | ||
|
|
f124b56786 | ||
|
|
d78e128b8b | ||
|
|
761b730dcb | ||
|
|
f34eca5f01 | ||
|
|
4cd332f3cf | ||
|
|
16484d394c | ||
|
|
e397bd6592 | ||
|
|
6a88d590bb | ||
|
|
ad8c073131 |
@@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install lm-eval==0.4.9
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -60,7 +60,6 @@ def launch_lm_eval(eval_config, tp_size):
|
|||||||
f"add_bos_token=true,"
|
f"add_bos_token=true,"
|
||||||
f"trust_remote_code={trust_remote_code},"
|
f"trust_remote_code={trust_remote_code},"
|
||||||
f"max_model_len={max_model_len},"
|
f"max_model_len={max_model_len},"
|
||||||
"allow_deprecated_quantization=True,"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
env_vars = eval_config.get("env_vars", None)
|
env_vars = eval_config.get("env_vars", None)
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
|
|||||||
|
|
||||||
## Performance benchmark quick overview
|
## Performance benchmark quick overview
|
||||||
|
|
||||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
|
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
|
||||||
|
|
||||||
**Benchmarking Duration**: about 1hr.
|
**Benchmarking Duration**: about 1hr.
|
||||||
|
|
||||||
@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
|||||||
|
|
||||||
Runtime environment variables:
|
Runtime environment variables:
|
||||||
|
|
||||||
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
|
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
|
||||||
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
|
||||||
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
|
||||||
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
|
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
|
||||||
@@ -34,9 +34,8 @@ Runtime environment variables:
|
|||||||
|
|
||||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||||
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
||||||
> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
|
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
|
||||||
> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
|
>
|
||||||
|
|
||||||
### Latency test
|
### Latency test
|
||||||
|
|
||||||
Here is an example of one test inside `latency-tests.json`:
|
Here is an example of one test inside `latency-tests.json`:
|
||||||
@@ -176,6 +175,19 @@ If you do not see the table, please wait till the benchmark finish running.
|
|||||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||||
|
|
||||||
#### Performance Results Comparison
|
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
|
||||||
|
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
|
||||||
|
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||||
|
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
|
||||||
|
|
||||||
Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
|
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
|
||||||
|
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||||
|
|
||||||
|
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||||
|
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
|
||||||
|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
|
||||||
|
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
|
||||||
|
|
||||||
|
A comparison diagram will be generated below the table.
|
||||||
|
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
||||||
|
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
||||||
|
|||||||
@@ -1,13 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import html as _html
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
|
||||||
from importlib import util
|
from importlib import util
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -15,49 +10,27 @@ import pandas as pd
|
|||||||
pd.options.display.float_format = "{:.2f}".format
|
pd.options.display.float_format = "{:.2f}".format
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
plotly_found = util.find_spec("plotly.express") is not None
|
||||||
|
|
||||||
DEFAULT_INFO_COLS = [
|
|
||||||
"Model",
|
|
||||||
"Dataset Name",
|
|
||||||
"Input Len",
|
|
||||||
"Output Len",
|
|
||||||
# "TP Size",
|
|
||||||
# "PP Size",
|
|
||||||
"# of max concurrency.",
|
|
||||||
"qps",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
|
|
||||||
pd.set_option("display.precision", 2)
|
|
||||||
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Core data compare
|
|
||||||
# -----------------------------
|
|
||||||
def compare_data_columns(
|
def compare_data_columns(
|
||||||
files: list[str],
|
files, name_column, data_column, info_cols, drop_column, debug=False
|
||||||
name_column: str,
|
|
||||||
data_column: str,
|
|
||||||
info_cols: list[str],
|
|
||||||
drop_column: str,
|
|
||||||
debug: bool = False,
|
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Align concatenation by keys derived from info_cols instead of row order.
|
Align concatenation by keys derived from info_cols instead of row order.
|
||||||
- Pick one canonical key list: subset of info_cols present in ALL files.
|
- Pick one canonical key list: subset of info_cols present in ALL files.
|
||||||
- For each file: set index to those keys, aggregate duplicates
|
- For each file: set index to those keys, aggregate duplicates
|
||||||
(mean for metric, first for names).
|
- (mean for metric, first for names).
|
||||||
- Concat along axis=1 (indexes align), then reset_index so callers can
|
- Concat along axis=1 (indexes align), then reset_index so callers can
|
||||||
group by columns.
|
- group by columns.
|
||||||
- If --debug, add a <file_label>_name column per file.
|
- If --debug, add a <file_label>_name column per file.
|
||||||
"""
|
"""
|
||||||
print("\ncompare_data_column:", data_column)
|
print("\ncompare_data_column:", data_column)
|
||||||
|
|
||||||
frames = []
|
frames = []
|
||||||
raw_data_cols: list[str] = []
|
raw_data_cols = []
|
||||||
compare_frames = []
|
compare_frames = []
|
||||||
|
|
||||||
cols_per_file: list[set] = []
|
# 1) choose a canonical key list from info_cols that exists in ALL files
|
||||||
|
cols_per_file = []
|
||||||
for f in files:
|
for f in files:
|
||||||
try:
|
try:
|
||||||
df_tmp = pd.read_json(f, orient="records")
|
df_tmp = pd.read_json(f, orient="records")
|
||||||
@@ -67,20 +40,24 @@ def compare_data_columns(
|
|||||||
|
|
||||||
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
||||||
if not key_cols:
|
if not key_cols:
|
||||||
|
# soft fallback: use any info_cols present in the first file
|
||||||
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
|
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
|
||||||
if not key_cols:
|
if not key_cols:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"No common key columns found from info_cols across the input files."
|
"No common key columns found from info_cols across the input files."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
|
||||||
meta_added = False
|
meta_added = False
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
df = pd.read_json(file, orient="records")
|
df = pd.read_json(file, orient="records")
|
||||||
|
|
||||||
|
# Keep rows that actually have the compared metric (same as original behavior)
|
||||||
if drop_column in df.columns:
|
if drop_column in df.columns:
|
||||||
df = df.dropna(subset=[drop_column], ignore_index=True)
|
df = df.dropna(subset=[drop_column], ignore_index=True)
|
||||||
|
|
||||||
|
# Stabilize numeric key columns (harmless if missing)
|
||||||
for c in (
|
for c in (
|
||||||
"Input Len",
|
"Input Len",
|
||||||
"Output Len",
|
"Output Len",
|
||||||
@@ -92,26 +69,32 @@ def compare_data_columns(
|
|||||||
if c in df.columns:
|
if c in df.columns:
|
||||||
df[c] = pd.to_numeric(df[c], errors="coerce")
|
df[c] = pd.to_numeric(df[c], errors="coerce")
|
||||||
|
|
||||||
|
# Ensure all key columns exist
|
||||||
for c in key_cols:
|
for c in key_cols:
|
||||||
if c not in df.columns:
|
if c not in df.columns:
|
||||||
df[c] = pd.NA
|
df[c] = pd.NA
|
||||||
|
|
||||||
|
# Set index = key_cols and aggregate duplicates → unique MultiIndex
|
||||||
df_idx = df.set_index(key_cols, drop=False)
|
df_idx = df.set_index(key_cols, drop=False)
|
||||||
|
|
||||||
|
# meta (key columns), unique per key
|
||||||
meta = df_idx[key_cols]
|
meta = df_idx[key_cols]
|
||||||
if not meta.index.is_unique:
|
if not meta.index.is_unique:
|
||||||
meta = meta.groupby(level=key_cols, dropna=False).first()
|
meta = meta.groupby(level=key_cols, dropna=False).first()
|
||||||
|
|
||||||
|
# metric series for this file, aggregated to one row per key
|
||||||
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
||||||
s = df_idx[data_column]
|
s = df_idx[data_column]
|
||||||
if not s.index.is_unique:
|
if not s.index.is_unique:
|
||||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
s = s.groupby(level=key_cols, dropna=False).mean()
|
||||||
s.name = file_label
|
s.name = file_label # column label like original
|
||||||
|
|
||||||
|
# add meta once (from first file) so keys are the leftmost columns
|
||||||
if not meta_added:
|
if not meta_added:
|
||||||
frames.append(meta)
|
frames.append(meta)
|
||||||
meta_added = True
|
meta_added = True
|
||||||
|
|
||||||
|
# (NEW) debug: aligned test-name column per file
|
||||||
if debug and name_column in df_idx.columns:
|
if debug and name_column in df_idx.columns:
|
||||||
name_s = df_idx[name_column]
|
name_s = df_idx[name_column]
|
||||||
if not name_s.index.is_unique:
|
if not name_s.index.is_unique:
|
||||||
@@ -123,19 +106,26 @@ def compare_data_columns(
|
|||||||
raw_data_cols.append(file_label)
|
raw_data_cols.append(file_label)
|
||||||
compare_frames.append(s)
|
compare_frames.append(s)
|
||||||
|
|
||||||
|
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
|
||||||
if len(compare_frames) >= 2:
|
if len(compare_frames) >= 2:
|
||||||
base = compare_frames[0]
|
base = compare_frames[0]
|
||||||
current = compare_frames[-1]
|
current = compare_frames[-1]
|
||||||
if "P99" in data_column or "Median" in data_column:
|
if "P99" in data_column or "Median" in data_column:
|
||||||
ratio = base / current
|
ratio = base / current # for latency
|
||||||
else:
|
else:
|
||||||
ratio = current / base
|
ratio = current / base
|
||||||
ratio = ratio.mask(base == 0)
|
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
||||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
||||||
frames.append(ratio)
|
frames.append(ratio)
|
||||||
|
|
||||||
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
|
# 4) concat on columns with aligned MultiIndex;
|
||||||
|
# then reset_index to return keys as columns
|
||||||
|
concat_df = pd.concat(frames, axis=1)
|
||||||
|
concat_df = concat_df.reset_index(drop=True).reset_index()
|
||||||
|
if "index" in concat_df.columns:
|
||||||
|
concat_df = concat_df.drop(columns=["index"])
|
||||||
|
|
||||||
|
# Ensure key/info columns appear first (in your info_cols order)
|
||||||
front = [c for c in info_cols if c in concat_df.columns]
|
front = [c for c in info_cols if c in concat_df.columns]
|
||||||
rest = [c for c in concat_df.columns if c not in front]
|
rest = [c for c in concat_df.columns if c not in front]
|
||||||
concat_df = concat_df[front + rest]
|
concat_df = concat_df[front + rest]
|
||||||
@@ -144,15 +134,20 @@ def compare_data_columns(
|
|||||||
return concat_df, raw_data_cols
|
return concat_df, raw_data_cols
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Split helper
|
|
||||||
# -----------------------------
|
|
||||||
def split_json_by_tp_pp(
|
def split_json_by_tp_pp(
|
||||||
input_file: str = "benchmark_results.json", output_root: str = "."
|
input_file: str = "benchmark_results.json", output_root: str = "."
|
||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
|
"""
|
||||||
|
Split a benchmark JSON into separate folders by (TP Size, PP Size).
|
||||||
|
|
||||||
|
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
|
||||||
|
Returns: list of file paths written.
|
||||||
|
"""
|
||||||
|
# Load JSON data into DataFrame
|
||||||
with open(input_file, encoding="utf-8") as f:
|
with open(input_file, encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|
||||||
|
# If the JSON is a dict with a list under common keys, use that list
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
for key in ("results", "serving_results", "benchmarks", "data"):
|
for key in ("results", "serving_results", "benchmarks", "data"):
|
||||||
if isinstance(data.get(key), list):
|
if isinstance(data.get(key), list):
|
||||||
@@ -161,6 +156,7 @@ def split_json_by_tp_pp(
|
|||||||
|
|
||||||
df = pd.DataFrame(data)
|
df = pd.DataFrame(data)
|
||||||
|
|
||||||
|
# Keep only "serving" tests
|
||||||
name_col = next(
|
name_col = next(
|
||||||
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
|
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
|
||||||
)
|
)
|
||||||
@@ -169,6 +165,7 @@ def split_json_by_tp_pp(
|
|||||||
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
|
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
|
||||||
].copy()
|
].copy()
|
||||||
|
|
||||||
|
# Handle alias column names
|
||||||
rename_map = {
|
rename_map = {
|
||||||
"tp_size": "TP Size",
|
"tp_size": "TP Size",
|
||||||
"tensor_parallel_size": "TP Size",
|
"tensor_parallel_size": "TP Size",
|
||||||
@@ -179,14 +176,21 @@ def split_json_by_tp_pp(
|
|||||||
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
|
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Ensure TP/PP columns exist (default to 1 if missing)
|
||||||
if "TP Size" not in df.columns:
|
if "TP Size" not in df.columns:
|
||||||
df["TP Size"] = 1
|
df["TP Size"] = 1
|
||||||
if "PP Size" not in df.columns:
|
if "PP Size" not in df.columns:
|
||||||
df["PP Size"] = 1
|
df["PP Size"] = 1
|
||||||
|
|
||||||
df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
|
# make sure TP/PP are numeric ints with no NaN
|
||||||
df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
|
df["TP Size"] = (
|
||||||
|
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
|
||||||
|
)
|
||||||
|
df["PP Size"] = (
|
||||||
|
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Split into separate folders
|
||||||
saved_paths: list[str] = []
|
saved_paths: list[str] = []
|
||||||
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
|
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
|
||||||
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
|
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
|
||||||
@@ -199,9 +203,32 @@ def split_json_by_tp_pp(
|
|||||||
return saved_paths
|
return saved_paths
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
def _add_limit_line(fig, y_value, label):
|
||||||
# Styling helpers
|
# Visible dashed line + annotation
|
||||||
# -----------------------------
|
fig.add_hline(
|
||||||
|
y=y_value,
|
||||||
|
line_dash="dash",
|
||||||
|
line_color="red" if "ttft" in label.lower() else "blue",
|
||||||
|
annotation_text=f"{label}: {y_value} ms",
|
||||||
|
annotation_position="top left",
|
||||||
|
)
|
||||||
|
# Optional: add a legend item (as a transparent helper trace)
|
||||||
|
if plot and plotly_found:
|
||||||
|
import plotly.graph_objects as go
|
||||||
|
|
||||||
|
fig.add_trace(
|
||||||
|
go.Scatter(
|
||||||
|
x=[None],
|
||||||
|
y=[None],
|
||||||
|
mode="lines",
|
||||||
|
line=dict(
|
||||||
|
dash="dash", color="red" if "ttft" in label.lower() else "blue"
|
||||||
|
),
|
||||||
|
name=f"{label}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||||
for c in [
|
for c in [
|
||||||
"# of max concurrency.",
|
"# of max concurrency.",
|
||||||
@@ -212,6 +239,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
|
|||||||
]:
|
]:
|
||||||
if c in df.columns:
|
if c in df.columns:
|
||||||
return c
|
return c
|
||||||
|
# Fallback: guess an integer-like column (harmless if unused)
|
||||||
for c in df.columns:
|
for c in df.columns:
|
||||||
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
||||||
return c
|
return c
|
||||||
@@ -220,7 +248,8 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
|
|||||||
|
|
||||||
def _highlight_threshold(
|
def _highlight_threshold(
|
||||||
df: pd.DataFrame, threshold: float
|
df: pd.DataFrame, threshold: float
|
||||||
) -> pd.io.formats.style.Styler:
|
) -> "pd.io.formats.style.Styler":
|
||||||
|
"""Highlight numeric per-configuration columns with value <= threshold."""
|
||||||
conc_col = _find_concurrency_col(df)
|
conc_col = _find_concurrency_col(df)
|
||||||
key_cols = [
|
key_cols = [
|
||||||
c
|
c
|
||||||
@@ -231,7 +260,6 @@ def _highlight_threshold(
|
|||||||
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
|
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
|
||||||
]
|
]
|
||||||
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
||||||
|
|
||||||
return df.style.map(
|
return df.style.map(
|
||||||
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
||||||
if pd.notna(v) and v <= threshold
|
if pd.notna(v) and v <= threshold
|
||||||
@@ -240,264 +268,7 @@ def _highlight_threshold(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
|
if __name__ == "__main__":
|
||||||
ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
|
|
||||||
if not ratio_cols:
|
|
||||||
return styler
|
|
||||||
|
|
||||||
styler = styler.apply(
|
|
||||||
lambda _: ["background-color: #fff3b0"] * len(styler.data),
|
|
||||||
subset=ratio_cols,
|
|
||||||
axis=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
styler = styler.set_table_styles(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"selector": f"th.col_heading.level0.col{i}",
|
|
||||||
"props": [("background-color", "#fff3b0")],
|
|
||||||
}
|
|
||||||
for i, col in enumerate(styler.data.columns)
|
|
||||||
if col in ratio_cols
|
|
||||||
],
|
|
||||||
overwrite=False,
|
|
||||||
)
|
|
||||||
return styler
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_two_decimals(
|
|
||||||
styler: pd.io.formats.style.Styler,
|
|
||||||
) -> pd.io.formats.style.Styler:
|
|
||||||
df = styler.data
|
|
||||||
num_cols = df.select_dtypes("number").columns
|
|
||||||
if len(num_cols) == 0:
|
|
||||||
return styler
|
|
||||||
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Valid max concurrency summary helpers
|
|
||||||
# -----------------------------
|
|
||||||
def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
|
|
||||||
key_cols = [
|
|
||||||
c
|
|
||||||
for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
|
|
||||||
if c in df.columns
|
|
||||||
]
|
|
||||||
exclude = set(key_cols + [conc_col, "qps", "QPS"])
|
|
||||||
|
|
||||||
cols: list[str] = []
|
|
||||||
for c in df.columns:
|
|
||||||
if c in exclude:
|
|
||||||
continue
|
|
||||||
lc = str(c).lower()
|
|
||||||
if lc.startswith("ratio"):
|
|
||||||
continue
|
|
||||||
if lc.endswith("_name") or lc == "test name" or lc == "test_name":
|
|
||||||
continue
|
|
||||||
if pd.api.types.is_numeric_dtype(df[c]):
|
|
||||||
cols.append(c)
|
|
||||||
return cols
|
|
||||||
|
|
||||||
|
|
||||||
def _max_concurrency_ok(
|
|
||||||
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
|
|
||||||
):
|
|
||||||
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
|
|
||||||
return pd.NA
|
|
||||||
|
|
||||||
d = df[[conc_col, cfg_col]].copy()
|
|
||||||
d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
|
|
||||||
d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
|
|
||||||
d = d.dropna(subset=[conc_col, cfg_col])
|
|
||||||
|
|
||||||
if d.empty:
|
|
||||||
return pd.NA
|
|
||||||
|
|
||||||
ok = d[d[cfg_col] <= threshold]
|
|
||||||
if ok.empty:
|
|
||||||
return pd.NA
|
|
||||||
|
|
||||||
return ok[conc_col].max()
|
|
||||||
|
|
||||||
|
|
||||||
def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
|
|
||||||
if (
|
|
||||||
df is None
|
|
||||||
or conc_col not in df.columns
|
|
||||||
or cfg_col not in df.columns
|
|
||||||
or pd.isna(conc_value)
|
|
||||||
):
|
|
||||||
return pd.NA
|
|
||||||
|
|
||||||
d = df[[conc_col, cfg_col]].copy()
|
|
||||||
d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
|
|
||||||
d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
|
|
||||||
|
|
||||||
conc_value = pd.to_numeric(conc_value, errors="coerce")
|
|
||||||
if pd.isna(conc_value):
|
|
||||||
return pd.NA
|
|
||||||
|
|
||||||
hit = d[d[conc_col] == conc_value]
|
|
||||||
if hit.empty:
|
|
||||||
return pd.NA
|
|
||||||
return hit[cfg_col].iloc[0]
|
|
||||||
|
|
||||||
|
|
||||||
def build_valid_max_concurrency_summary_html(
|
|
||||||
tput_group_df: pd.DataFrame | None,
|
|
||||||
ttft_group_df: pd.DataFrame | None,
|
|
||||||
tpot_group_df: pd.DataFrame | None,
|
|
||||||
conc_col: str,
|
|
||||||
args,
|
|
||||||
) -> str:
|
|
||||||
if ttft_group_df is None and tpot_group_df is None:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
ttft_cols = (
|
|
||||||
_config_value_columns(ttft_group_df, conc_col)
|
|
||||||
if ttft_group_df is not None
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
tpot_cols = (
|
|
||||||
_config_value_columns(tpot_group_df, conc_col)
|
|
||||||
if tpot_group_df is not None
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
tput_cols = (
|
|
||||||
_config_value_columns(tput_group_df, conc_col)
|
|
||||||
if tput_group_df is not None
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
|
|
||||||
if ttft_group_df is not None and tpot_group_df is not None:
|
|
||||||
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
|
|
||||||
if tput_group_df is not None:
|
|
||||||
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
|
|
||||||
else:
|
|
||||||
cfg_cols = ttft_cols or tpot_cols
|
|
||||||
|
|
||||||
if not cfg_cols:
|
|
||||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
|
||||||
|
|
||||||
rows = []
|
|
||||||
for cfg in cfg_cols:
|
|
||||||
ttft_max = (
|
|
||||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
|
||||||
if ttft_group_df is not None
|
|
||||||
else pd.NA
|
|
||||||
)
|
|
||||||
tpot_max = (
|
|
||||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
|
||||||
if tpot_group_df is not None
|
|
||||||
else pd.NA
|
|
||||||
)
|
|
||||||
both = (
|
|
||||||
pd.NA
|
|
||||||
if (pd.isna(ttft_max) or pd.isna(tpot_max))
|
|
||||||
else min(ttft_max, tpot_max)
|
|
||||||
)
|
|
||||||
|
|
||||||
tput_at_both = (
|
|
||||||
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
|
|
||||||
if tput_group_df is not None
|
|
||||||
else pd.NA
|
|
||||||
)
|
|
||||||
ttft_at_both = (
|
|
||||||
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
|
|
||||||
if ttft_group_df is not None
|
|
||||||
else pd.NA
|
|
||||||
)
|
|
||||||
tpot_at_both = (
|
|
||||||
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
|
|
||||||
if tpot_group_df is not None
|
|
||||||
else pd.NA
|
|
||||||
)
|
|
||||||
|
|
||||||
rows.append(
|
|
||||||
{
|
|
||||||
"Configuration": cfg,
|
|
||||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
|
||||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
|
||||||
f"Max {conc_col} (Both)": both,
|
|
||||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
|
||||||
"TTFT @ Both (ms)": ttft_at_both,
|
|
||||||
"TPOT @ Both (ms)": tpot_at_both,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_df = pd.DataFrame(rows)
|
|
||||||
|
|
||||||
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
|
|
||||||
for c in summary_df.columns:
|
|
||||||
if c == "Configuration":
|
|
||||||
continue
|
|
||||||
summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
|
|
||||||
|
|
||||||
both_col = f"Max {conc_col} (Both)"
|
|
||||||
|
|
||||||
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
|
|
||||||
formatters = {}
|
|
||||||
for c in summary_df.columns:
|
|
||||||
if c == "Configuration":
|
|
||||||
continue
|
|
||||||
# default argument binds per-column formatter correctly
|
|
||||||
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
|
||||||
|
|
||||||
styler = summary_df.style.format(formatters)
|
|
||||||
|
|
||||||
def _green(v):
|
|
||||||
return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
|
|
||||||
|
|
||||||
if both_col in summary_df.columns:
|
|
||||||
styler = styler.map(_green, subset=[both_col])
|
|
||||||
|
|
||||||
title = (
|
|
||||||
'<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
|
|
||||||
"Valid Max Concurrency Summary"
|
|
||||||
"</div>\n"
|
|
||||||
)
|
|
||||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Plot helper
|
|
||||||
# -----------------------------
|
|
||||||
def _add_limit_line(fig, y_value: float, label: str):
|
|
||||||
fig.add_hline(
|
|
||||||
y=y_value,
|
|
||||||
line_dash="dash",
|
|
||||||
line_color="red" if "ttft" in label.lower() else "blue",
|
|
||||||
annotation_text=f"{label}: {y_value} ms",
|
|
||||||
annotation_position="top left",
|
|
||||||
)
|
|
||||||
if plotly_found:
|
|
||||||
import plotly.graph_objects as go
|
|
||||||
|
|
||||||
fig.add_trace(
|
|
||||||
go.Scatter(
|
|
||||||
x=[None],
|
|
||||||
y=[None],
|
|
||||||
mode="lines",
|
|
||||||
line=dict(
|
|
||||||
dash="dash",
|
|
||||||
color="red" if "ttft" in label.lower() else "blue",
|
|
||||||
),
|
|
||||||
name=label,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Refactored main + group-first report
|
|
||||||
# -----------------------------
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class MetricPlan:
|
|
||||||
data_cols: list[str]
|
|
||||||
drop_column: str
|
|
||||||
|
|
||||||
|
|
||||||
def build_parser() -> argparse.ArgumentParser:
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-f", "--file", action="append", type=str, help="input file name"
|
"-f", "--file", action="append", type=str, help="input file name"
|
||||||
@@ -537,289 +308,149 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
default=100.0,
|
default=100.0,
|
||||||
help="Reference limit for TPOT plots (ms)",
|
help="Reference limit for TPOT plots (ms)",
|
||||||
)
|
)
|
||||||
return parser
|
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
def choose_metrics(latency: str) -> MetricPlan:
|
|
||||||
latency = (latency or "").lower()
|
|
||||||
drop_column = "P99"
|
drop_column = "P99"
|
||||||
|
name_column = "Test name"
|
||||||
|
info_cols = [
|
||||||
|
"Model",
|
||||||
|
"Dataset Name",
|
||||||
|
"Input Len",
|
||||||
|
"Output Len",
|
||||||
|
"TP Size",
|
||||||
|
"PP Size",
|
||||||
|
"# of max concurrency.",
|
||||||
|
"qps",
|
||||||
|
]
|
||||||
|
|
||||||
if "median" in latency:
|
if "median" in args.latency:
|
||||||
return MetricPlan(
|
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||||
data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
|
html_msgs_for_data_cols = [
|
||||||
drop_column=drop_column,
|
"Compare Output Tokens /n",
|
||||||
)
|
"Median TTFT /n",
|
||||||
|
"Median TPOT /n",
|
||||||
return MetricPlan(
|
]
|
||||||
data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
|
drop_column = "P99"
|
||||||
drop_column=drop_column,
|
elif "p99" in args.latency:
|
||||||
)
|
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
|
||||||
|
html_msgs_for_data_cols = [
|
||||||
|
"Compare Output Tokens /n",
|
||||||
def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
|
"P99 TTFT /n",
|
||||||
if not args.file:
|
"P99 TPOT /n",
|
||||||
raise ValueError("No input files provided. Use -f/--file.")
|
]
|
||||||
|
|
||||||
if len(args.file) == 1:
|
if len(args.file) == 1:
|
||||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
||||||
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
|
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
|
||||||
else:
|
else:
|
||||||
files = args.file
|
files = args.file
|
||||||
|
|
||||||
return files, info_cols
|
|
||||||
|
|
||||||
|
|
||||||
def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
|
|
||||||
y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
|
|
||||||
return info_cols[y_axis_index]
|
|
||||||
|
|
||||||
|
|
||||||
def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
|
|
||||||
filtered_info_cols = info_cols[:4]
|
|
||||||
group_cols = [c for c in filtered_info_cols if c in output_df.columns]
|
|
||||||
if not group_cols:
|
|
||||||
raise ValueError(
|
|
||||||
f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
|
|
||||||
f"but DataFrame has: {list(output_df.columns)}"
|
|
||||||
)
|
|
||||||
return group_cols
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_group_key(name):
|
|
||||||
return name if isinstance(name, tuple) else (name,)
|
|
||||||
|
|
||||||
|
|
||||||
def group_filename(name, prefix: str = "perf_comparison_") -> str:
|
|
||||||
name_vals = normalize_group_key(name)
|
|
||||||
safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
|
|
||||||
return f"{prefix}{safe}.html"
|
|
||||||
|
|
||||||
|
|
||||||
def build_group_suffix(group_cols: list[str], name) -> str:
|
|
||||||
name_vals = normalize_group_key(name)
|
|
||||||
return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
|
|
||||||
|
|
||||||
|
|
||||||
def render_metric_table_html(
|
|
||||||
display_group: pd.DataFrame,
|
|
||||||
metric_label: str,
|
|
||||||
group_suffix: str,
|
|
||||||
args,
|
|
||||||
) -> str:
|
|
||||||
title = (
|
|
||||||
f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
|
|
||||||
f"{_html.escape(metric_label)}"
|
|
||||||
f" — {_html.escape(group_suffix)}"
|
|
||||||
f"</div>\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
metric_name = metric_label.lower()
|
|
||||||
if "ttft" in metric_name:
|
|
||||||
styler = _highlight_threshold(display_group, args.ttft_max_ms)
|
|
||||||
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
|
||||||
styler = _highlight_threshold(display_group, args.tpot_max_ms)
|
|
||||||
else:
|
|
||||||
styler = display_group.style
|
|
||||||
|
|
||||||
styler = _apply_two_decimals(styler)
|
|
||||||
styler = highlight_ratio_columns(styler)
|
|
||||||
|
|
||||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
|
||||||
|
|
||||||
|
|
||||||
def maybe_write_plot(
|
|
||||||
main_fh,
|
|
||||||
sub_fh,
|
|
||||||
group_df: pd.DataFrame,
|
|
||||||
raw_data_cols: list[str],
|
|
||||||
metric_label: str,
|
|
||||||
y_axis_col: str,
|
|
||||||
args,
|
|
||||||
):
|
|
||||||
if not (args.plot and plotly_found):
|
|
||||||
return
|
|
||||||
|
|
||||||
import plotly.express as px
|
|
||||||
|
|
||||||
df = group_df[raw_data_cols].sort_values(by=y_axis_col)
|
|
||||||
df_melted = df.melt(
|
|
||||||
id_vars=y_axis_col,
|
|
||||||
var_name="Configuration",
|
|
||||||
value_name=metric_label,
|
|
||||||
)
|
|
||||||
|
|
||||||
fig = px.line(
|
|
||||||
df_melted,
|
|
||||||
x=y_axis_col,
|
|
||||||
y=metric_label,
|
|
||||||
color="Configuration",
|
|
||||||
title=f"{metric_label} vs {y_axis_col}",
|
|
||||||
markers=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Ensure plot hover + y tick labels are also 2 decimals.
|
|
||||||
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
|
||||||
fig.update_yaxes(tickformat=".2f")
|
|
||||||
|
|
||||||
metric_name = metric_label.lower()
|
|
||||||
if "ttft" in metric_name:
|
|
||||||
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
|
||||||
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
|
||||||
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
|
||||||
|
|
||||||
html = fig.to_html(full_html=True, include_plotlyjs="cdn")
|
|
||||||
main_fh.write(html)
|
|
||||||
sub_fh.write(html)
|
|
||||||
|
|
||||||
|
|
||||||
def build_group_keys(
|
|
||||||
df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
|
|
||||||
):
|
|
||||||
if sort_cols:
|
|
||||||
df = df.sort_values(by=sort_cols)
|
|
||||||
gb = df.groupby(group_cols, dropna=False)
|
|
||||||
return [k for k, _ in gb]
|
|
||||||
|
|
||||||
|
|
||||||
def write_report_group_first(
|
|
||||||
files: list[str], info_cols: list[str], plan: MetricPlan, args
|
|
||||||
):
|
|
||||||
name_column = "Test name"
|
|
||||||
y_axis_col = get_y_axis_col(info_cols, args.xaxis)
|
|
||||||
|
|
||||||
print("comparing : " + ", ".join(files))
|
print("comparing : " + ", ".join(files))
|
||||||
|
debug = args.debug
|
||||||
metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
|
plot = args.plot
|
||||||
group_cols_canonical: list[str] | None = None
|
# For Plot feature, assign y axis from one of info_cols
|
||||||
|
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
|
||||||
for metric_label in plan.data_cols:
|
with open("perf_comparison.html", "w") as text_file:
|
||||||
output_df, raw_data_cols = compare_data_columns(
|
for i in range(len(data_cols_to_compare)):
|
||||||
files,
|
output_df, raw_data_cols = compare_data_columns(
|
||||||
name_column,
|
files,
|
||||||
metric_label,
|
name_column,
|
||||||
info_cols,
|
data_cols_to_compare[i],
|
||||||
plan.drop_column,
|
info_cols,
|
||||||
debug=args.debug,
|
drop_column,
|
||||||
)
|
debug=debug,
|
||||||
|
|
||||||
raw_data_cols = list(raw_data_cols)
|
|
||||||
raw_data_cols.insert(0, y_axis_col)
|
|
||||||
|
|
||||||
group_cols = get_group_cols(output_df, info_cols)
|
|
||||||
if group_cols_canonical is None:
|
|
||||||
group_cols_canonical = group_cols
|
|
||||||
else:
|
|
||||||
group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
|
|
||||||
|
|
||||||
metric_cache[metric_label] = (
|
|
||||||
output_df.sort_values(by=args.xaxis),
|
|
||||||
raw_data_cols,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not group_cols_canonical:
|
|
||||||
raise ValueError("No canonical group columns found across metrics.")
|
|
||||||
|
|
||||||
first_metric = plan.data_cols[0]
|
|
||||||
first_df_sorted, _ = metric_cache[first_metric]
|
|
||||||
group_keys = build_group_keys(
|
|
||||||
first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
|
|
||||||
)
|
|
||||||
|
|
||||||
metric_groupbys = {
|
|
||||||
metric_label: df.groupby(group_cols_canonical, dropna=False)
|
|
||||||
for metric_label, (df, _) in metric_cache.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
|
||||||
main_fh.write('<meta charset="utf-8">\n')
|
|
||||||
for gkey in group_keys:
|
|
||||||
gkey_tuple = normalize_group_key(gkey)
|
|
||||||
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
|
||||||
sub_path = group_filename(gkey_tuple)
|
|
||||||
group_header = (
|
|
||||||
'<div style="font-size: 1.4em; font-weight: 700; '
|
|
||||||
'margin: 18px 0 10px 0;">'
|
|
||||||
f"{_html.escape(suffix)}"
|
|
||||||
"</div>\n"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
main_fh.write(group_header)
|
# For Plot feature, insert y axis from one of info_cols
|
||||||
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
raw_data_cols.insert(0, info_cols[y_axis_index])
|
||||||
sub_fh.write('<meta charset="utf-8">\n')
|
|
||||||
sub_fh.write(group_header)
|
|
||||||
tput_group_df = None
|
|
||||||
ttft_group_df = None
|
|
||||||
tpot_group_df = None
|
|
||||||
conc_col = args.xaxis
|
|
||||||
|
|
||||||
for metric_label in plan.data_cols:
|
filtered_info_cols = info_cols[:-2]
|
||||||
gb = metric_groupbys[metric_label]
|
existing_group_cols = [
|
||||||
df_sorted, raw_data_cols = metric_cache[metric_label]
|
c for c in filtered_info_cols if c in output_df.columns
|
||||||
|
]
|
||||||
|
if not existing_group_cols:
|
||||||
|
raise ValueError(
|
||||||
|
f"No valid group-by columns "
|
||||||
|
f"Expected subset: {filtered_info_cols}, "
|
||||||
|
f"but DataFrame has: {list(output_df.columns)}"
|
||||||
|
)
|
||||||
|
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||||
|
output_df_sorted = output_df.sort_values(by=args.xaxis)
|
||||||
|
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
||||||
|
for name, group in output_groups:
|
||||||
|
group_name = (
|
||||||
|
",".join(map(str, name)).replace(",", "_").replace("/", "-")
|
||||||
|
)
|
||||||
|
group_html_name = "perf_comparison_" + group_name + ".html"
|
||||||
|
|
||||||
try:
|
metric_name = str(data_cols_to_compare[i]).lower()
|
||||||
group_df = gb.get_group(gkey)
|
if "tok/s" in metric_name:
|
||||||
except KeyError:
|
html = group.to_html()
|
||||||
missing = (
|
elif "ttft" in metric_name:
|
||||||
'<div style="font-size: 1.1em; font-weight: 600; '
|
styler = _highlight_threshold(group, args.ttft_max_ms).format(
|
||||||
'margin: 10px 0;">'
|
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||||
f"{_html.escape(metric_label)} — missing for this group"
|
na_rep="—",
|
||||||
"</div>\n"
|
)
|
||||||
|
html = styler.to_html(
|
||||||
|
table_attributes='border="1" class="dataframe"'
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
"tpot" in metric_name
|
||||||
|
or "median" in metric_name
|
||||||
|
or "p99" in metric_name
|
||||||
|
):
|
||||||
|
styler = _highlight_threshold(group, args.tpot_max_ms).format(
|
||||||
|
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||||
|
na_rep="—",
|
||||||
|
)
|
||||||
|
html = styler.to_html(
|
||||||
|
table_attributes='border="1" class="dataframe"'
|
||||||
|
)
|
||||||
|
|
||||||
|
text_file.write(html_msgs_for_data_cols[i])
|
||||||
|
text_file.write(html)
|
||||||
|
with open(group_html_name, "a+") as sub_text_file:
|
||||||
|
sub_text_file.write(html_msgs_for_data_cols[i])
|
||||||
|
sub_text_file.write(html)
|
||||||
|
|
||||||
|
if plot and plotly_found:
|
||||||
|
import plotly.express as px
|
||||||
|
|
||||||
|
df = group[raw_data_cols]
|
||||||
|
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||||
|
# Melt DataFrame for plotting
|
||||||
|
df_melted = df_sorted.melt(
|
||||||
|
id_vars=info_cols[y_axis_index],
|
||||||
|
var_name="Configuration",
|
||||||
|
value_name=data_cols_to_compare[i],
|
||||||
|
)
|
||||||
|
title = (
|
||||||
|
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||||
|
)
|
||||||
|
# Create Plotly line chart
|
||||||
|
fig = px.line(
|
||||||
|
df_melted,
|
||||||
|
x=info_cols[y_axis_index],
|
||||||
|
y=data_cols_to_compare[i],
|
||||||
|
color="Configuration",
|
||||||
|
title=title,
|
||||||
|
markers=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
main_fh.write(missing)
|
# ---- Add threshold lines based on metric name ----
|
||||||
sub_fh.write(missing)
|
if "ttft" in metric_name:
|
||||||
continue
|
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
||||||
|
elif (
|
||||||
|
"tpot" in metric_name
|
||||||
|
or "median" in metric_name
|
||||||
|
or "p99" in metric_name
|
||||||
|
):
|
||||||
|
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
||||||
|
|
||||||
if conc_col not in group_df.columns:
|
# Export to HTML
|
||||||
conc_col = _find_concurrency_col(group_df)
|
text_file.write(
|
||||||
|
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||||
mn = metric_label.lower().strip()
|
)
|
||||||
if "tok/s" in mn:
|
sub_text_file.write(
|
||||||
tput_group_df = group_df
|
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||||
elif "ttft" in mn:
|
)
|
||||||
ttft_group_df = group_df
|
|
||||||
elif mn in ("p99", "median") or "tpot" in mn:
|
|
||||||
tpot_group_df = group_df
|
|
||||||
|
|
||||||
display_group = group_df.drop(
|
|
||||||
columns=group_cols_canonical, errors="ignore"
|
|
||||||
)
|
|
||||||
|
|
||||||
html = render_metric_table_html(
|
|
||||||
display_group, metric_label, suffix, args
|
|
||||||
)
|
|
||||||
main_fh.write(html)
|
|
||||||
sub_fh.write(html)
|
|
||||||
|
|
||||||
maybe_write_plot(
|
|
||||||
main_fh,
|
|
||||||
sub_fh,
|
|
||||||
group_df=group_df,
|
|
||||||
raw_data_cols=raw_data_cols,
|
|
||||||
metric_label=metric_label,
|
|
||||||
y_axis_col=y_axis_col,
|
|
||||||
args=args,
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_html = build_valid_max_concurrency_summary_html(
|
|
||||||
tput_group_df=tput_group_df,
|
|
||||||
ttft_group_df=ttft_group_df,
|
|
||||||
tpot_group_df=tpot_group_df,
|
|
||||||
conc_col=conc_col,
|
|
||||||
args=args,
|
|
||||||
)
|
|
||||||
if summary_html:
|
|
||||||
main_fh.write(summary_html)
|
|
||||||
sub_fh.write(summary_html)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = build_parser().parse_args()
|
|
||||||
info_cols = list(DEFAULT_INFO_COLS)
|
|
||||||
plan = choose_metrics(args.latency)
|
|
||||||
files, info_cols = prepare_input_files(args, info_cols)
|
|
||||||
write_report_group_first(files, info_cols, plan, args)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|||||||
24
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Executable file → Normal file
24
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Executable file → Normal file
@@ -49,11 +49,7 @@ check_cpus() {
|
|||||||
echo "Need at least 1 NUMA to run benchmarking."
|
echo "Need at least 1 NUMA to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
|
declare -g gpu_type="cpu"
|
||||||
declare -g gpu_type="arm64-cpu"
|
|
||||||
else
|
|
||||||
declare -g gpu_type="cpu"
|
|
||||||
fi
|
|
||||||
echo "GPU type is $gpu_type"
|
echo "GPU type is $gpu_type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,8 +207,8 @@ run_latency_tests() {
|
|||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [ "$ON_CPU" == "1" ]; then
|
||||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
|
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
|
||||||
world_size=$(($tp*$pp))
|
world_size=$(($tp*$pp))
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
@@ -280,8 +276,8 @@ run_throughput_tests() {
|
|||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [ "$ON_CPU" == "1" ]; then
|
||||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
|
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
|
||||||
world_size=$(($tp*$pp))
|
world_size=$(($tp*$pp))
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
@@ -397,8 +393,8 @@ run_serving_tests() {
|
|||||||
|
|
||||||
# check if there is enough resources to run the test
|
# check if there is enough resources to run the test
|
||||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [ "$ON_CPU" == "1" ]; then
|
||||||
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
|
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
|
||||||
world_size=$(($tp*$pp))
|
world_size=$(($tp*$pp))
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
@@ -500,9 +496,9 @@ run_serving_tests() {
|
|||||||
main() {
|
main() {
|
||||||
local ARCH
|
local ARCH
|
||||||
ARCH=''
|
ARCH=''
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [ "$ON_CPU" == "1" ];then
|
||||||
check_cpus
|
check_cpus
|
||||||
ARCH="-$gpu_type"
|
ARCH='-cpu'
|
||||||
else
|
else
|
||||||
check_gpus
|
check_gpus
|
||||||
ARCH="$arch_suffix"
|
ARCH="$arch_suffix"
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"num_iters_warmup": 5,
|
|
||||||
"num_iters": 15
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
{
|
|
||||||
"defaults": {
|
|
||||||
"qps_list": [
|
|
||||||
"inf"
|
|
||||||
],
|
|
||||||
"max_concurrency_list": [
|
|
||||||
12,
|
|
||||||
16,
|
|
||||||
24,
|
|
||||||
32,
|
|
||||||
64,
|
|
||||||
128,
|
|
||||||
200
|
|
||||||
],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tests": [
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -19,8 +19,10 @@
|
|||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
"max_num_batched_tokens": 2048,
|
"max_num_batched_tokens": 2048,
|
||||||
"max_num_seqs": 256
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
@@ -149,45 +151,6 @@
|
|||||||
"random-output-len": 128
|
"random-output-len": 128
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
steps:
|
steps:
|
||||||
# aarch64 + CUDA builds
|
# aarch64 + CUDA builds
|
||||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
- label: "Build arm64 wheel - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cuda-12-9
|
id: build-wheel-arm64-cuda-12-9
|
||||||
agents:
|
agents:
|
||||||
@@ -11,11 +11,11 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
- label: "Build arm64 wheel - CUDA 13.0"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cuda-13-0
|
id: build-wheel-arm64-cuda-13-0
|
||||||
agents:
|
agents:
|
||||||
@@ -26,12 +26,12 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# aarch64 build
|
# aarch64 build
|
||||||
- label: "Build wheel - aarch64 - CPU"
|
- label: "Build arm64 CPU wheel"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cpu
|
id: build-wheel-arm64-cpu
|
||||||
agents:
|
agents:
|
||||||
@@ -40,39 +40,39 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# x86 + CUDA builds
|
# x86 + CUDA builds
|
||||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
- label: "Build wheel - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-x86-cuda-12-9
|
id: build-wheel-cuda-12-9
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
- label: "Build wheel - CUDA 13.0"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-x86-cuda-13-0
|
id: build-wheel-cuda-13-0
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# x86 CPU wheel build
|
# x86 CPU wheel build
|
||||||
- label: "Build wheel - x86_64 - CPU"
|
- label: "Build x86 CPU wheel"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-x86-cpu
|
id: build-wheel-x86-cpu
|
||||||
agents:
|
agents:
|
||||||
@@ -81,12 +81,12 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# Build release images (CUDA 12.9)
|
# Build release images (12.9)
|
||||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
- label: "Build release image (x86)"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-x86
|
id: build-release-image-x86
|
||||||
agents:
|
agents:
|
||||||
@@ -99,7 +99,7 @@ steps:
|
|||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
- label: "Build release image (arm64)"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-arm64
|
id: build-release-image-arm64
|
||||||
agents:
|
agents:
|
||||||
@@ -109,66 +109,27 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||||
|
|
||||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
# Add job to create multi-arch manifest
|
||||||
|
- label: "Create multi-arch manifest"
|
||||||
depends_on:
|
depends_on:
|
||||||
- build-release-image-x86
|
- build-release-image-x86
|
||||||
- build-release-image-arm64
|
- build-release-image-arm64
|
||||||
id: create-multi-arch-manifest
|
id: create-multi-arch-manifest
|
||||||
agents:
|
agents:
|
||||||
queue: small_cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Annotate release workflow - CUDA 12.9"
|
- label: "Annotate release workflow"
|
||||||
depends_on:
|
depends_on:
|
||||||
- create-multi-arch-manifest
|
- create-multi-arch-manifest
|
||||||
id: annotate-release-workflow
|
id: annotate-release-workflow
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/annotate-release.sh"
|
|
||||||
|
|
||||||
- block: "Build CUDA 13.0 release images"
|
|
||||||
key: block-release-image-build-cuda-13-0
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
|
||||||
depends_on: block-release-image-build-cuda-13-0
|
|
||||||
id: build-release-image-x86-cuda-13-0
|
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "bash .buildkite/scripts/annotate-release.sh"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
|
||||||
# re-tag to default image tag and push, just in case arm64 build fails
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
|
||||||
|
|
||||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
|
||||||
depends_on: block-release-image-build-cuda-13-0
|
|
||||||
id: build-release-image-arm64-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
|
||||||
|
|
||||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
|
||||||
depends_on:
|
|
||||||
- build-release-image-x86-cuda-13-0
|
|
||||||
- build-release-image-arm64-cuda-13-0
|
|
||||||
id: create-multi-arch-manifest-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
|
||||||
|
|
||||||
- input: "Provide Release version here"
|
- input: "Provide Release version here"
|
||||||
id: input-release-version
|
id: input-release-version
|
||||||
@@ -176,26 +137,6 @@ steps:
|
|||||||
- text: "What is the release version?"
|
- text: "What is the release version?"
|
||||||
key: release-version
|
key: release-version
|
||||||
|
|
||||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
|
||||||
key: block-upload-release-wheels
|
|
||||||
depends_on:
|
|
||||||
- input-release-version
|
|
||||||
- build-wheel-x86-cuda-12-9
|
|
||||||
- build-wheel-x86-cuda-13-0
|
|
||||||
- build-wheel-x86-cpu
|
|
||||||
- build-wheel-arm64-cuda-12-9
|
|
||||||
- build-wheel-arm64-cuda-13-0
|
|
||||||
- build-wheel-arm64-cpu
|
|
||||||
|
|
||||||
- label: "Upload release wheels to PyPI and GitHub"
|
|
||||||
depends_on:
|
|
||||||
- block-upload-release-wheels
|
|
||||||
id: upload-release-wheels
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/upload-release-wheels.sh"
|
|
||||||
|
|
||||||
- block: "Build CPU release image"
|
- block: "Build CPU release image"
|
||||||
key: block-cpu-release-image-build
|
key: block-cpu-release-image-build
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
@@ -228,31 +169,24 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build ROCm release image"
|
|
||||||
key: block-rocm-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build release image (ROCm)"
|
|
||||||
depends_on: block-rocm-release-image-build
|
|
||||||
id: build-release-image-rocm
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
# Build base image first
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
|
||||||
# Build vLLM ROCm image using the base
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
|
||||||
|
|
||||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||||
depends_on:
|
depends_on:
|
||||||
- create-multi-arch-manifest
|
- create-multi-arch-manifest
|
||||||
if: build.env("NIGHTLY") == "1"
|
if: build.env("NIGHTLY") == "1"
|
||||||
agents:
|
agents:
|
||||||
queue: small_cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
|
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||||
|
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||||
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||||
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||||
|
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||||
|
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||||
|
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||||
|
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||||
|
- "docker manifest push vllm/vllm-openai:nightly"
|
||||||
|
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||||
# Clean up old nightly builds (keep only last 14)
|
# Clean up old nightly builds (keep only last 14)
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||||
plugins:
|
plugins:
|
||||||
@@ -262,384 +196,3 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
DOCKERHUB_USERNAME: "vllmbot"
|
||||||
|
|
||||||
- label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
|
||||||
depends_on:
|
|
||||||
- create-multi-arch-manifest-cuda-13-0
|
|
||||||
if: build.env("NIGHTLY") == "1"
|
|
||||||
agents:
|
|
||||||
queue: small_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
|
||||||
plugins:
|
|
||||||
- docker-login#v3.0.0:
|
|
||||||
username: vllmbot
|
|
||||||
password-env: DOCKERHUB_TOKEN
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# ROCm Release Pipeline (x86_64 only)
|
|
||||||
# =============================================================================
|
|
||||||
#
|
|
||||||
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
|
|
||||||
# To build a specific version, trigger the build from that branch/tag.
|
|
||||||
#
|
|
||||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
|
||||||
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
|
|
||||||
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
|
|
||||||
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
|
|
||||||
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
|
|
||||||
#
|
|
||||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
|
||||||
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
|
|
||||||
#
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
# ROCm Input Step - Collect build configuration (manual trigger only)
|
|
||||||
- input: "ROCm Wheel Release Build Configuration"
|
|
||||||
key: input-rocm-config
|
|
||||||
depends_on: ~
|
|
||||||
if: build.source == "ui"
|
|
||||||
fields:
|
|
||||||
- text: "Python Version"
|
|
||||||
key: "rocm-python-version"
|
|
||||||
default: "3.12"
|
|
||||||
hint: "Python version (e.g., 3.12)"
|
|
||||||
- text: "GPU Architectures"
|
|
||||||
key: "rocm-pytorch-rocm-arch"
|
|
||||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
|
||||||
hint: "Semicolon-separated GPU architectures"
|
|
||||||
- select: "Upload Wheels to S3"
|
|
||||||
key: "rocm-upload-wheels"
|
|
||||||
default: "true"
|
|
||||||
options:
|
|
||||||
- label: "No - Build only (nightly/dev)"
|
|
||||||
value: "false"
|
|
||||||
- label: "Yes - Upload to S3 (release)"
|
|
||||||
value: "true"
|
|
||||||
- select: "Force Rebuild Base Wheels"
|
|
||||||
key: "rocm-force-rebuild"
|
|
||||||
default: "false"
|
|
||||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
|
||||||
options:
|
|
||||||
- label: "No - Use cached wheels if available"
|
|
||||||
value: "false"
|
|
||||||
- label: "Yes - Rebuild even if cache exists"
|
|
||||||
value: "true"
|
|
||||||
|
|
||||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
|
||||||
- label: ":rocm: Build ROCm Base Wheels"
|
|
||||||
id: build-rocm-base-wheels
|
|
||||||
depends_on:
|
|
||||||
- step: input-rocm-config
|
|
||||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
# Set configuration and check cache
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Get values from meta-data (set by input step) or use defaults
|
|
||||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
|
||||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
|
||||||
|
|
||||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
|
||||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
|
||||||
|
|
||||||
# Check for force rebuild flag
|
|
||||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
|
||||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
|
||||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "ROCm Base Wheels Build Configuration"
|
|
||||||
echo "========================================"
|
|
||||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
|
||||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
|
||||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Save resolved config for later jobs
|
|
||||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
|
||||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
|
||||||
|
|
||||||
# Check S3 cache for pre-built wheels
|
|
||||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
|
||||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
|
||||||
echo ""
|
|
||||||
echo "Cache key: $${CACHE_KEY}"
|
|
||||||
echo "Cache path: $${CACHE_PATH}"
|
|
||||||
|
|
||||||
# Save cache key for downstream jobs
|
|
||||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
|
||||||
|
|
||||||
CACHE_STATUS="miss"
|
|
||||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
|
||||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
|
||||||
else
|
|
||||||
echo "Force rebuild requested, skipping cache check"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
|
||||||
echo ""
|
|
||||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
|
||||||
echo ""
|
|
||||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
|
||||||
|
|
||||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
|
||||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
|
||||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
|
|
||||||
# Mark that we used cache (for Docker image handling)
|
|
||||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Cache download complete. Skipping Docker build."
|
|
||||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "CACHE MISS. Building from scratch..."
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Build full base image (for later vLLM build)
|
|
||||||
DOCKER_BUILDKIT=1 docker buildx build \
|
|
||||||
--file docker/Dockerfile.rocm_base \
|
|
||||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
|
||||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
--load \
|
|
||||||
.
|
|
||||||
|
|
||||||
# Build debs_wheel_release stage for wheel extraction
|
|
||||||
DOCKER_BUILDKIT=1 docker buildx build \
|
|
||||||
--file docker/Dockerfile.rocm_base \
|
|
||||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
|
||||||
--target debs_wheel_release \
|
|
||||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
--load \
|
|
||||||
.
|
|
||||||
|
|
||||||
# Extract wheels from Docker image
|
|
||||||
mkdir -p artifacts/rocm-base-wheels
|
|
||||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
|
||||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
|
||||||
docker rm $${container_id}
|
|
||||||
echo "Extracted base wheels:"
|
|
||||||
ls -lh artifacts/rocm-base-wheels/
|
|
||||||
|
|
||||||
# Upload wheels to S3 cache for future builds
|
|
||||||
echo ""
|
|
||||||
echo "Uploading wheels to S3 cache..."
|
|
||||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
|
||||||
|
|
||||||
# Export base Docker image for reuse in vLLM build
|
|
||||||
mkdir -p artifacts/rocm-docker-image
|
|
||||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
|
||||||
echo "Docker image size:"
|
|
||||||
ls -lh artifacts/rocm-docker-image/
|
|
||||||
|
|
||||||
# Upload large Docker image to S3 (also cached by cache key)
|
|
||||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
|
||||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
|
||||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
|
|
||||||
# Save the S3 path for downstream jobs
|
|
||||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
|
||||||
|
|
||||||
# Mark that we did NOT use cache
|
|
||||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Build complete. Wheels cached for future builds."
|
|
||||||
fi
|
|
||||||
artifact_paths:
|
|
||||||
- "artifacts/rocm-base-wheels/*.whl"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
|
||||||
- label: ":python: Build vLLM ROCm Wheel"
|
|
||||||
id: build-rocm-vllm-wheel
|
|
||||||
depends_on:
|
|
||||||
- step: build-rocm-base-wheels
|
|
||||||
allow_failure: false
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
timeout_in_minutes: 180
|
|
||||||
commands:
|
|
||||||
# Download artifacts and prepare Docker image
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
|
||||||
# This fixes version detection when tags are moved/force-pushed
|
|
||||||
echo "Fetching latest tags from origin..."
|
|
||||||
git fetch --tags --force origin
|
|
||||||
|
|
||||||
# Log tag information for debugging version detection
|
|
||||||
echo "========================================"
|
|
||||||
echo "Git Tag Verification"
|
|
||||||
echo "========================================"
|
|
||||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
|
||||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
|
||||||
echo ""
|
|
||||||
echo "Recent tags (pointing to commits near HEAD):"
|
|
||||||
git tag -l --sort=-creatordate | head -5
|
|
||||||
echo "setuptools_scm version detection:"
|
|
||||||
pip install -q setuptools_scm 2>/dev/null || true
|
|
||||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Download wheel artifacts from current build
|
|
||||||
echo "Downloading wheel artifacts from current build"
|
|
||||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
|
||||||
|
|
||||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
|
||||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
|
||||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
|
||||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
|
||||||
echo "This should have been set by the build-rocm-base-wheels job"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
|
||||||
mkdir -p artifacts/rocm-docker-image
|
|
||||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
|
||||||
|
|
||||||
# Load base Docker image and capture the tag
|
|
||||||
echo "Loading base Docker image..."
|
|
||||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
|
||||||
echo "$${LOAD_OUTPUT}"
|
|
||||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
|
||||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
|
||||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
|
||||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
|
||||||
echo "ERROR: Failed to extract image tag from docker load output"
|
|
||||||
echo "Load output was: $${LOAD_OUTPUT}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
|
||||||
|
|
||||||
# Prepare base wheels for Docker build context
|
|
||||||
mkdir -p docker/context/base-wheels
|
|
||||||
touch docker/context/base-wheels/.keep
|
|
||||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
|
||||||
echo "Base wheels for vLLM build:"
|
|
||||||
ls -lh docker/context/base-wheels/
|
|
||||||
|
|
||||||
# Get GPU architectures from meta-data
|
|
||||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
|
||||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "Building vLLM wheel with:"
|
|
||||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
|
||||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
|
||||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
|
||||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
|
||||||
DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--file docker/Dockerfile.rocm \
|
|
||||||
--target export_vllm_wheel_release \
|
|
||||||
--output type=local,dest=rocm-dist \
|
|
||||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
|
||||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
|
||||||
--build-arg REMOTE_VLLM=0 \
|
|
||||||
--build-arg GIT_REPO_CHECK=1 \
|
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
|
||||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
|
||||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
|
||||||
.
|
|
||||||
|
|
||||||
echo "Built vLLM wheel:"
|
|
||||||
ls -lh rocm-dist/*.whl
|
|
||||||
|
|
||||||
# Copy wheel to artifacts directory
|
|
||||||
mkdir -p artifacts/rocm-vllm-wheel
|
|
||||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
|
||||||
echo "Final vLLM wheel:"
|
|
||||||
ls -lh artifacts/rocm-vllm-wheel/
|
|
||||||
artifact_paths:
|
|
||||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 3: Upload Wheels to S3
|
|
||||||
- label: ":s3: Upload ROCm Wheels to S3"
|
|
||||||
id: upload-rocm-wheels
|
|
||||||
depends_on:
|
|
||||||
- step: build-rocm-vllm-wheel
|
|
||||||
allow_failure: false
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
commands:
|
|
||||||
# Download all wheel artifacts and run upload
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Check if upload is enabled (from env var, meta-data, or release branch)
|
|
||||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
|
||||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
|
||||||
# Try to get from meta-data (input form)
|
|
||||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "Upload check:"
|
|
||||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
|
||||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# Skip upload if not enabled
|
|
||||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
|
||||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
|
||||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Upload enabled, proceeding..."
|
|
||||||
|
|
||||||
# Download artifacts from current build
|
|
||||||
echo "Downloading artifacts from current build"
|
|
||||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
|
||||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
|
||||||
|
|
||||||
# Run upload script
|
|
||||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|
||||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
|
||||||
- label: ":memo: Annotate ROCm wheel release"
|
|
||||||
id: annotate-rocm-release
|
|
||||||
depends_on:
|
|
||||||
- step: upload-rocm-wheels
|
|
||||||
allow_failure: true
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
|
||||||
env:
|
|
||||||
S3_BUCKET: "vllm-wheels"
|
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ To download and upload the image:
|
|||||||
\`\`\`
|
\`\`\`
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||||
@@ -46,12 +45,6 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
|||||||
docker push vllm/vllm-openai:latest-aarch64
|
docker push vllm/vllm-openai:latest-aarch64
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
|
|
||||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
|
|
||||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
|
||||||
docker push vllm/vllm-openai:latest-rocm
|
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
|
||||||
|
|
||||||
docker manifest rm vllm/vllm-openai:latest
|
docker manifest rm vllm/vllm-openai:latest
|
||||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
|
|||||||
@@ -1,74 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
#
|
|
||||||
# Generate Buildkite annotation for ROCm wheel release
|
|
||||||
|
|
||||||
set -ex

# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
#
# `sed -n ... p` prints only when the ARG line actually matches, so a missing
# Dockerfile or an unexpected BASE_IMAGE format produces empty output instead
# of leaking the raw line (or nothing) into the annotation. The explicit
# default below then supplies "unknown"; a `|| echo` fallback on the pipeline
# would never fire because the last command of the pipeline succeeds.
ROCM_VERSION=$(sed -nE 's/^ARG BASE_IMAGE=.*:([0-9]+\.[0-9]+).*/\1/p' docker/Dockerfile.rocm_base 2>/dev/null | head -n 1)
ROCM_VERSION="${ROCM_VERSION:-unknown}"
# Build settings recorded as Buildkite meta-data; fall back to the defaults
# when the key is absent or the agent call fails.
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"

# The heredoc delimiter is unquoted on purpose: ${...} references are expanded
# by the shell, and literal backticks are escaped so they reach the annotation
# as Markdown code fences.
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## :rocm: ROCm Wheel Release

### Build Configuration
| Setting | Value |
|---------|-------|
| **ROCm Version** | ${ROCM_VERSION} |
| **Python Version** | ${PYTHON_VERSION} |
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
| **Branch** | \`${BUILDKITE_BRANCH}\` |
| **Commit** | \`${BUILDKITE_COMMIT}\` |

### :package: Installation

**Install from this build (by commit):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/

# Example:
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\`

**Install from nightly (if published):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\`

### :floppy_disk: Download Wheels Directly

\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/

# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
\`\`\`

### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton_rocm**: Triton built for ROCm
- **torchvision**: TorchVision for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings

### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
EOF
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
#
|
|
||||||
# Cache helper for ROCm base wheels
|
|
||||||
#
|
|
||||||
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
|
|
||||||
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
|
|
||||||
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
|
|
||||||
# cache-rocm-base-wheels.sh download - Download wheels from cache
|
|
||||||
# cache-rocm-base-wheels.sh key - Output the cache key
# cache-rocm-base-wheels.sh path - Output the full S3 cache path
|
|
||||||
#
|
|
||||||
# Environment variables:
|
|
||||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
|
||||||
# PYTHON_VERSION - Python version (affects cache key)
|
|
||||||
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
|
|
||||||
#
|
|
||||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
|
||||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
|
||||||
DOCKERFILE="docker/Dockerfile.rocm_base"
|
|
||||||
CACHE_PREFIX="rocm/cache"
|
|
||||||
|
|
||||||
# Generate hash from Dockerfile content + build args
|
|
||||||
# Print the cache key for the current base-wheel build on stdout.
#
# The key is "<dockerfile_hash>-<args_hash>", where:
#   dockerfile_hash - first 16 hex chars of the SHA-256 of $DOCKERFILE
#   args_hash       - first 8 hex chars of the SHA-256 of
#                     "${PYTHON_VERSION}|${PYTORCH_ROCM_ARCH}" (unset -> empty)
#
# Exits with status 1 (message on stderr) when the Dockerfile is missing or a
# hash cannot be computed, so a truncated key is never handed to the caller.
generate_cache_key() {
    # Include Dockerfile content
    if [[ ! -f "$DOCKERFILE" ]]; then
        echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
        exit 1
    fi

    # Declare first, assign afterwards: `local var=$(cmd)` would report the
    # status of `local` and hide a failing hash command.
    local dockerfile_hash args_string args_hash

    dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
    # Checking for an empty result catches a failed sha256sum even when the
    # pipeline status is masked (e.g. pipefail not enabled by the caller).
    if [[ -z "$dockerfile_hash" ]]; then
        echo "ERROR: Failed to hash Dockerfile: $DOCKERFILE" >&2
        exit 1
    fi

    # Include key build args that affect the output
    # These should match the ARGs in Dockerfile.rocm_base that change the build output
    # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
    args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
    # `echo` (with its trailing newline) is kept deliberately: changing it
    # would change every existing cache key.
    args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
    if [[ -z "$args_hash" ]]; then
        echo "ERROR: Failed to hash build arguments" >&2
        exit 1
    fi

    echo "${dockerfile_hash}-${args_hash}"
}
|
|
||||||
|
|
||||||
CACHE_KEY=$(generate_cache_key)
|
|
||||||
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
|
|
||||||
|
|
||||||
# Print the number of wheel files in artifacts/rocm-base-wheels/.
#
# The body runs in a subshell (parentheses instead of braces) so that enabling
# nullglob does not leak into the rest of the script. With nullglob an empty
# match expands to nothing, so the count is simply 0; the previous
# `ls *.whl | wc -l` form failed under `set -euo pipefail` when nothing
# matched and aborted the script before the caller could report the problem.
count_base_wheels() (
    shopt -s nullglob
    wheels=(artifacts/rocm-base-wheels/*.whl)
    echo "${#wheels[@]}"
)

# Dispatch on the sub-command given as the first argument. Diagnostics for
# `check` go to stderr so that stdout carries only "hit" or "miss".
case "${1:-}" in
    check)
        echo "Checking cache for key: ${CACHE_KEY}" >&2
        echo "Cache path: ${CACHE_PATH}" >&2
        echo "Variables used in cache key:" >&2
        echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
        echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2

        # Check if cache exists by listing objects
        # We look for at least one .whl file
        echo "Running: aws s3 ls ${CACHE_PATH}" >&2
        # A failing listing (missing prefix, no credentials, ...) is treated
        # as a cache miss rather than an error, hence `|| true`.
        S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
        echo "S3 ls output:" >&2
        echo "$S3_OUTPUT" | head -5 >&2

        # Pure-bash substring test: piping into `grep -q` under pipefail can
        # report a false miss when grep exits early on a large listing and
        # the writer is killed by SIGPIPE.
        if [[ "$S3_OUTPUT" == *".whl"* ]]; then
            echo "hit"
        else
            echo "miss"
        fi
        ;;

    upload)
        echo "========================================"
        echo "Uploading wheels to cache"
        echo "========================================"
        echo "Cache key: ${CACHE_KEY}"
        echo "Cache path: ${CACHE_PATH}"
        echo ""

        if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
            echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
            exit 1
        fi

        WHEEL_COUNT=$(count_base_wheels)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
        fi

        echo "Uploading $WHEEL_COUNT wheels..."
        aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"

        echo ""
        echo "Cache upload complete!"
        echo "========================================"
        ;;

    download)
        echo "========================================"
        echo "Downloading wheels from cache"
        echo "========================================"
        echo "Cache key: ${CACHE_KEY}"
        echo "Cache path: ${CACHE_PATH}"
        echo ""

        mkdir -p artifacts/rocm-base-wheels
        aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/

        echo ""
        echo "Downloaded wheels:"
        ls -lh artifacts/rocm-base-wheels/

        WHEEL_COUNT=$(count_base_wheels)
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"
        ;;

    key)
        echo "${CACHE_KEY}"
        ;;

    path)
        echo "${CACHE_PATH}"
        ;;

    *)
        # Unknown or missing sub-command: print usage on stderr and fail.
        echo "Usage: $0 {check|upload|download|key|path}" >&2
        echo "" >&2
        echo "Commands:" >&2
        echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
        echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
        echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
        echo " key - Output the cache key" >&2
        echo " path - Output the full S3 cache path" >&2
        exit 1
        ;;
esac
|
|
||||||
@@ -3,14 +3,7 @@
|
|||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||||
# This script uses DockerHub API to list and delete old tags with specified prefix
|
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
||||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
|
||||||
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
|
||||||
|
|
||||||
# Get tag prefix from argument, default to "nightly-" if not provided
|
|
||||||
TAG_PREFIX="${1:-nightly-}"
|
|
||||||
|
|
||||||
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
|
||||||
|
|
||||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||||
@@ -52,7 +45,7 @@ get_all_tags() {
|
|||||||
set -x
|
set -x
|
||||||
|
|
||||||
# Get both last_updated timestamp and tag name, separated by |
|
# Get both last_updated timestamp and tag name, separated by |
|
||||||
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
|
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||||
|
|
||||||
if [ -z "$tags" ]; then
|
if [ -z "$tags" ]; then
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -16,18 +16,6 @@ from urllib.parse import quote
|
|||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
|
||||||
def normalize_package_name(name: str) -> str:
    """
    Return the PEP 503 normalized form of a package name.
    https://peps.python.org/pep-0503/#normalized-names

    The name is lowercased and every run of hyphens, underscores and
    periods is collapsed into one hyphen.
    """
    lowered = name.lower()
    separator_run = r"[-_.]+"
    return re.sub(separator_run, "-", lowered)
|
|
||||||
|
|
||||||
|
|
||||||
if not sys.version_info >= (3, 12):
|
if not sys.version_info >= (3, 12):
|
||||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||||
|
|
||||||
@@ -90,13 +78,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
|||||||
version = version.removesuffix("." + variant)
|
version = version.removesuffix("." + variant)
|
||||||
else:
|
else:
|
||||||
if "+" in version:
|
if "+" in version:
|
||||||
version_part, suffix = version.split("+", 1)
|
version, variant = version.split("+")
|
||||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
|
||||||
# Git hashes and other suffixes are NOT variants
|
|
||||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
|
||||||
variant = suffix
|
|
||||||
version = version_part
|
|
||||||
# Otherwise keep the full version string (variant stays None)
|
|
||||||
|
|
||||||
return WheelFileInfo(
|
return WheelFileInfo(
|
||||||
package_name=package_name,
|
package_name=package_name,
|
||||||
@@ -224,26 +206,6 @@ def generate_index_and_metadata(
|
|||||||
print("No wheel files found, skipping index generation.")
|
print("No wheel files found, skipping index generation.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# For ROCm builds: inherit variant from vllm wheel
|
|
||||||
# All ROCm wheels should share the same variant as vllm
|
|
||||||
rocm_variant = None
|
|
||||||
for file in parsed_files:
|
|
||||||
if (
|
|
||||||
file.package_name == "vllm"
|
|
||||||
and file.variant
|
|
||||||
and file.variant.startswith("rocm")
|
|
||||||
):
|
|
||||||
rocm_variant = file.variant
|
|
||||||
print(f"Detected ROCm variant from vllm: {rocm_variant}")
|
|
||||||
break
|
|
||||||
|
|
||||||
# Apply ROCm variant to all wheels without a variant
|
|
||||||
if rocm_variant:
|
|
||||||
for file in parsed_files:
|
|
||||||
if file.variant is None:
|
|
||||||
file.variant = rocm_variant
|
|
||||||
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
|
|
||||||
|
|
||||||
# Group by variant
|
# Group by variant
|
||||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||||
for file in parsed_files:
|
for file in parsed_files:
|
||||||
@@ -294,8 +256,8 @@ def generate_index_and_metadata(
|
|||||||
|
|
||||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# gather all package names in this variant (normalized per PEP 503)
|
# gather all package names in this variant
|
||||||
packages = set(normalize_package_name(f.package_name) for f in files)
|
packages = set(f.package_name for f in files)
|
||||||
if variant == "default":
|
if variant == "default":
|
||||||
# these packages should also appear in the "project list"
|
# these packages should also appear in the "project list"
|
||||||
# generate after all variants are processed
|
# generate after all variants are processed
|
||||||
@@ -307,10 +269,8 @@ def generate_index_and_metadata(
|
|||||||
f.write(project_list_str)
|
f.write(project_list_str)
|
||||||
|
|
||||||
for package in packages:
|
for package in packages:
|
||||||
# filter files belonging to this package only (compare normalized names)
|
# filter files belonging to this package only
|
||||||
package_files = [
|
package_files = [f for f in files if f.package_name == package]
|
||||||
f for f in files if normalize_package_name(f.package_name) == package
|
|
||||||
]
|
|
||||||
package_dir = variant_dir / package
|
package_dir = variant_dir / package
|
||||||
package_dir.mkdir(parents=True, exist_ok=True)
|
package_dir.mkdir(parents=True, exist_ok=True)
|
||||||
index_str, metadata_str = generate_package_index_and_metadata(
|
index_str, metadata_str = generate_package_index_and_metadata(
|
||||||
@@ -331,7 +291,6 @@ if __name__ == "__main__":
|
|||||||
"""
|
"""
|
||||||
Arguments:
|
Arguments:
|
||||||
--version <version> : version string for the current build (e.g., commit hash)
|
--version <version> : version string for the current build (e.g., commit hash)
|
||||||
--wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
|
|
||||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
||||||
--output-dir <output_directory> : directory to store generated index files
|
--output-dir <output_directory> : directory to store generated index files
|
||||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
||||||
@@ -359,12 +318,6 @@ if __name__ == "__main__":
|
|||||||
required=True,
|
required=True,
|
||||||
help="Directory to store generated index files",
|
help="Directory to store generated index files",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--wheel-dir",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Directory containing wheel files (default to be same as `version`)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--alias-to-default",
|
"--alias-to-default",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -381,13 +334,8 @@ if __name__ == "__main__":
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
version = args.version
|
version = args.version
|
||||||
# Allow rocm/ prefix, reject other slashes and all backslashes
|
if "/" in version or "\\" in version:
|
||||||
if "\\" in version:
|
raise ValueError("Version string must not contain slashes.")
|
||||||
raise ValueError("Version string must not contain backslashes.")
|
|
||||||
if "/" in version and not version.startswith("rocm/"):
|
|
||||||
raise ValueError(
|
|
||||||
"Version string must not contain slashes (except for 'rocm/' prefix)."
|
|
||||||
)
|
|
||||||
current_objects_path = Path(args.current_objects)
|
current_objects_path = Path(args.current_objects)
|
||||||
output_dir = Path(args.output_dir)
|
output_dir = Path(args.output_dir)
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
@@ -424,7 +372,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
||||||
|
|
||||||
# keep only "official" files for a non-nightly version (specified by cli args)
|
# keep only "official" files for a non-nightly version (specifed by cli args)
|
||||||
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
||||||
if PY_VERSION_RE.match(version):
|
if PY_VERSION_RE.match(version):
|
||||||
# upload-wheels.sh ensures no "dev" is in args.version
|
# upload-wheels.sh ensures no "dev" is in args.version
|
||||||
@@ -436,25 +384,9 @@ if __name__ == "__main__":
|
|||||||
print("Nightly version detected, keeping all wheel files.")
|
print("Nightly version detected, keeping all wheel files.")
|
||||||
|
|
||||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||||
# s3://vllm-wheels/{wheel_dir}/<wheel files>
|
# s3://vllm-wheels/{version}/<wheel files>
|
||||||
# s3://vllm-wheels/<anything>/<index files>
|
# s3://vllm-wheels/<anything>/<index files>
|
||||||
#
|
wheel_base_dir = Path(output_dir).parent / version
|
||||||
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
|
|
||||||
# - rocm/{commit}/ (same as wheels)
|
|
||||||
# - rocm/nightly/
|
|
||||||
# - rocm/{version}/
|
|
||||||
# All these are under the "rocm/" prefix, so relative paths should be
|
|
||||||
# relative to "rocm/", not the bucket root.
|
|
||||||
if args.wheel_dir:
|
|
||||||
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
|
|
||||||
wheel_dir = args.wheel_dir.strip().rstrip("/")
|
|
||||||
elif version.startswith("rocm/"):
|
|
||||||
# For rocm/commit, wheel_base_dir should be just the commit part
|
|
||||||
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
|
|
||||||
wheel_dir = version.split("/", 1)[1]
|
|
||||||
else:
|
|
||||||
wheel_dir = version
|
|
||||||
wheel_base_dir = Path(output_dir).parent / wheel_dir
|
|
||||||
index_base_dir = Path(output_dir)
|
index_base_dir = Path(output_dir)
|
||||||
|
|
||||||
generate_index_and_metadata(
|
generate_index_and_metadata(
|
||||||
|
|||||||
@@ -141,6 +141,7 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
|
|||||||
--ignore=entrypoints/openai/test_audio.py \
|
--ignore=entrypoints/openai/test_audio.py \
|
||||||
--ignore=entrypoints/openai/test_shutdown.py \
|
--ignore=entrypoints/openai/test_shutdown.py \
|
||||||
--ignore=entrypoints/openai/test_completion.py \
|
--ignore=entrypoints/openai/test_completion.py \
|
||||||
|
--ignore=entrypoints/openai/test_sleep.py \
|
||||||
--ignore=entrypoints/openai/test_models.py \
|
--ignore=entrypoints/openai/test_models.py \
|
||||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||||
@@ -209,21 +210,12 @@ if [[ $commands == *"--shard-id="* ]]; then
|
|||||||
wait "${pid}"
|
wait "${pid}"
|
||||||
STATUS+=($?)
|
STATUS+=($?)
|
||||||
done
|
done
|
||||||
at_least_one_shard_with_tests=0
|
|
||||||
for st in "${STATUS[@]}"; do
|
for st in "${STATUS[@]}"; do
|
||||||
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
|
if [[ ${st} -ne 0 ]]; then
|
||||||
echo "One of the processes failed with $st"
|
echo "One of the processes failed with $st"
|
||||||
exit "${st}"
|
exit "${st}"
|
||||||
elif [[ ${st} -eq 5 ]]; then
|
|
||||||
echo "Shard exited with status 5 (no tests collected) - treating as success"
|
|
||||||
else # This means st is 0
|
|
||||||
at_least_one_shard_with_tests=1
|
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
|
|
||||||
echo "All shards reported no tests collected. Failing the build."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
else
|
else
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||||
docker run \
|
docker run \
|
||||||
|
|||||||
@@ -50,7 +50,6 @@ function cpu_tests() {
|
|||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
@@ -84,7 +83,7 @@ function cpu_tests() {
|
|||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -x -s -v \
|
pytest -x -s -v \
|
||||||
tests/lora/test_qwenvl.py"
|
tests/lora/test_qwen2vl.py"
|
||||||
|
|
||||||
# online serving: tp+pp
|
# online serving: tp+pp
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ docker run \
|
|||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex

# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.

TAG_VARIANT="${1:-}"
if [ -n "$TAG_VARIANT" ]; then
    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
    TAG_NAME="$TAG_VARIANT-nightly"
else
    ORIG_TAG_SUFFIX=""
    TAG_NAME="nightly"
fi

# Fail fast with a clear message instead of pulling a malformed ":-x86_64"
# tag when the commit is not provided by the CI environment.
: "${BUILDKITE_COMMIT:?BUILDKITE_COMMIT must be set}"
ORIG_TAG_NAME="$BUILDKITE_COMMIT"

# Source (ECR Public) and destination (DockerHub) repositories.
SRC_REPO="public.ecr.aws/q9t5s3a7/vllm-release-repo"
DST_REPO="vllm/vllm-openai"

echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"

# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
for arch in x86_64 aarch64; do
    docker pull "$SRC_REPO:$ORIG_TAG_NAME-$arch$ORIG_TAG_SUFFIX"
done
# tag arch-dependent images
for arch in x86_64 aarch64; do
    docker tag "$SRC_REPO:$ORIG_TAG_NAME-$arch$ORIG_TAG_SUFFIX" "$DST_REPO:$TAG_NAME-$arch"
done
# push arch-dependent images to DockerHub
for arch in x86_64 aarch64; do
    docker push "$DST_REPO:$TAG_NAME-$arch"
done
# push arch-independent manifest to DockerHub
# Both manifests (moving tag and commit-pinned tag) are created before either
# is pushed, matching the original command order.
docker manifest create "$DST_REPO:$TAG_NAME" "$DST_REPO:$TAG_NAME-x86_64" "$DST_REPO:$TAG_NAME-aarch64" --amend
docker manifest create "$DST_REPO:$TAG_NAME-$BUILDKITE_COMMIT" "$DST_REPO:$TAG_NAME-x86_64" "$DST_REPO:$TAG_NAME-aarch64" --amend
docker manifest push "$DST_REPO:$TAG_NAME"
docker manifest push "$DST_REPO:$TAG_NAME-$BUILDKITE_COMMIT"
|
|
||||||
@@ -2,17 +2,6 @@
|
|||||||
|
|
||||||
set -euox pipefail
|
set -euox pipefail
|
||||||
|
|
||||||
# To detect ROCm
|
|
||||||
# Check multiple indicators:
|
|
||||||
if [ -e /dev/kfd ] || \
|
|
||||||
[ -d /opt/rocm ] || \
|
|
||||||
command -v rocm-smi &> /dev/null || \
|
|
||||||
[ -n "${ROCM_HOME:-}" ]; then
|
|
||||||
IS_ROCM=1
|
|
||||||
else
|
|
||||||
IS_ROCM=0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $# -lt 4 ]]; then
|
if [[ $# -lt 4 ]]; then
|
||||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -37,18 +26,13 @@ for command in "${COMMANDS[@]}"; do
|
|||||||
echo "$command"
|
echo "$command"
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
start_network() {
|
start_network() {
|
||||||
docker network create --subnet=192.168.10.0/24 docker-net
|
docker network create --subnet=192.168.10.0/24 docker-net
|
||||||
}
|
}
|
||||||
|
|
||||||
start_nodes() {
|
start_nodes() {
|
||||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||||
if [ "$IS_ROCM" -eq 1 ]; then
|
GPU_DEVICES='"device='
|
||||||
GPU_DEVICES='--device /dev/kfd --device /dev/dri -e HIP_VISIBLE_DEVICES='
|
|
||||||
else
|
|
||||||
GPU_DEVICES='--gpus "device='
|
|
||||||
fi
|
|
||||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||||
@@ -56,9 +40,7 @@ start_nodes() {
|
|||||||
GPU_DEVICES+=','
|
GPU_DEVICES+=','
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
if [ "$IS_ROCM" -eq 0 ]; then
|
GPU_DEVICES+='"'
|
||||||
GPU_DEVICES+='"'
|
|
||||||
fi
|
|
||||||
|
|
||||||
# start the container in detached mode
|
# start the container in detached mode
|
||||||
# things to note:
|
# things to note:
|
||||||
@@ -67,7 +49,7 @@ start_nodes() {
|
|||||||
# 3. map the huggingface cache directory to the container
|
# 3. map the huggingface cache directory to the container
|
||||||
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||||
# starting from 192.168.10.11)
|
# starting from 192.168.10.11)
|
||||||
docker run -d $GPU_DEVICES --shm-size=10.24gb -e HF_TOKEN \
|
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
||||||
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
||||||
/bin/bash -c "tail -f /dev/null"
|
/bin/bash -c "tail -f /dev/null"
|
||||||
|
|||||||
@@ -44,10 +44,10 @@ trap cleanup EXIT
|
|||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
for BACK in "${BACKENDS[@]}"; do
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||||
|
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||||
vllm serve "$MODEL" \
|
vllm serve "$MODEL" \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--all2all-backend $BACK \
|
|
||||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
||||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
||||||
|
|||||||
@@ -43,12 +43,12 @@ trap cleanup EXIT
|
|||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
for BACK in "${BACKENDS[@]}"; do
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||||
|
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||||
vllm serve "$MODEL" \
|
vllm serve "$MODEL" \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--tensor-parallel-size 4 \
|
--tensor-parallel-size 4 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--all2all-backend $BACK \
|
|
||||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
|
|||||||
@@ -1,104 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
BUCKET="vllm-wheels"
|
|
||||||
SUBPATH=$BUILDKITE_COMMIT
|
|
||||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
|
||||||
|
|
||||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
|
||||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
|
||||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
|
||||||
if [ -z "$GIT_VERSION" ]; then
|
|
||||||
echo "[FATAL] Not on a git tag, cannot create release."
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
|
||||||
fi
|
|
||||||
# sanity check for version mismatch
|
|
||||||
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
|
|
||||||
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
|
|
||||||
echo "[WARNING] Force release and ignore version mismatch"
|
|
||||||
else
|
|
||||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
|
||||||
|
|
||||||
# check pypi token
|
|
||||||
if [ -z "$PYPI_TOKEN" ]; then
|
|
||||||
echo "[FATAL] PYPI_TOKEN is not set."
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
export TWINE_USERNAME="__token__"
|
|
||||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# check github token
|
|
||||||
if [ -z "$GITHUB_TOKEN" ]; then
|
|
||||||
echo "[FATAL] GITHUB_TOKEN is not set."
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
export GH_TOKEN="$GITHUB_TOKEN"
|
|
||||||
fi
|
|
||||||
|
|
||||||
set -x # avoid printing secrets above
|
|
||||||
|
|
||||||
# download gh CLI from github
|
|
||||||
# Get latest gh CLI version from GitHub API
|
|
||||||
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
|
|
||||||
if [ -z "$GH_VERSION" ]; then
|
|
||||||
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Downloading gh CLI version: $GH_VERSION"
|
|
||||||
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
|
|
||||||
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
|
|
||||||
GH_INSTALL_DIR="/tmp/gh-install"
|
|
||||||
mkdir -p "$GH_INSTALL_DIR"
|
|
||||||
pushd "$GH_INSTALL_DIR"
|
|
||||||
curl -L -o "$GH_TARBALL" "$GH_URL"
|
|
||||||
tar -xzf "$GH_TARBALL"
|
|
||||||
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
|
|
||||||
if [ -z "$GH_BIN" ]; then
|
|
||||||
echo "[FATAL] Failed to find gh CLI executable"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
|
|
||||||
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
|
|
||||||
command "$GH_BIN" release list --limit 5
|
|
||||||
popd
|
|
||||||
|
|
||||||
# install twine from pypi
|
|
||||||
python3 -m venv /tmp/vllm-release-env
|
|
||||||
source /tmp/vllm-release-env/bin/activate
|
|
||||||
pip install twine
|
|
||||||
python3 -m twine --version
|
|
||||||
|
|
||||||
# copy release wheels to local directory
|
|
||||||
DIST_DIR=/tmp/vllm-release-dist
|
|
||||||
echo "Existing wheels on S3:"
|
|
||||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
|
||||||
echo "Copying wheels to local directory"
|
|
||||||
mkdir -p $DIST_DIR
|
|
||||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
|
||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
|
||||||
echo "Wheels copied to local directory"
|
|
||||||
# generate source tarball
|
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
|
||||||
ls -la $DIST_DIR
|
|
||||||
|
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
|
||||||
if [ -z "$PYPI_WHEEL_FILES" ]; then
|
|
||||||
echo "No default variant wheels found, quitting..."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
python3 -m twine check $PYPI_WHEEL_FILES
|
|
||||||
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
|
|
||||||
echo "Wheels uploaded to PyPI"
|
|
||||||
|
|
||||||
# create release on GitHub with the release version and all wheels
|
|
||||||
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
|
|
||||||
@@ -1,151 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
#
|
|
||||||
# Upload ROCm wheels to S3 with proper index generation
|
|
||||||
#
|
|
||||||
# Required environment variables:
|
|
||||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
|
|
||||||
# S3_BUCKET (default: vllm-wheels)
|
|
||||||
#
|
|
||||||
# S3 path structure:
|
|
||||||
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
|
|
||||||
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
|
|
||||||
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# ======== Configuration ========
|
|
||||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
|
||||||
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
|
|
||||||
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
|
|
||||||
INDICES_OUTPUT_DIR="rocm-indices"
|
|
||||||
PYTHON="${PYTHON_PROG:-python3}"
|
|
||||||
|
|
||||||
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
|
|
||||||
MANYLINUX_VERSION="manylinux_2_35"
|
|
||||||
|
|
||||||
echo "========================================"
|
|
||||||
echo "ROCm Wheel Upload Configuration"
|
|
||||||
echo "========================================"
|
|
||||||
echo "S3 Bucket: $BUCKET"
|
|
||||||
echo "S3 Path: $ROCM_SUBPATH"
|
|
||||||
echo "Commit: $BUILDKITE_COMMIT"
|
|
||||||
echo "Branch: $BUILDKITE_BRANCH"
|
|
||||||
echo "========================================"
|
|
||||||
|
|
||||||
# ======== Part 0: Setup Python ========
|
|
||||||
|
|
||||||
# Detect if python3.12+ is available
|
|
||||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
|
|
||||||
if [[ "$has_new_python" -eq 0 ]]; then
|
|
||||||
# Use new python from docker
|
|
||||||
# Use --user to ensure files are created with correct ownership (not root)
|
|
||||||
docker pull python:3-slim
|
|
||||||
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Using python interpreter: $PYTHON"
|
|
||||||
echo "Python version: $($PYTHON --version)"
|
|
||||||
|
|
||||||
# ======== Part 1: Collect and prepare wheels ========
|
|
||||||
|
|
||||||
# Collect all wheels
|
|
||||||
mkdir -p all-rocm-wheels
|
|
||||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
|
||||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
|
||||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
|
||||||
|
|
||||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
|
||||||
echo "ERROR: No wheels found to upload!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Rename linux to manylinux in wheel filenames
|
|
||||||
for wheel in all-rocm-wheels/*.whl; do
|
|
||||||
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
|
|
||||||
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
|
|
||||||
mv -- "$wheel" "$new_wheel"
|
|
||||||
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Wheels to upload:"
|
|
||||||
ls -lh all-rocm-wheels/
|
|
||||||
|
|
||||||
# ======== Part 2: Upload wheels to S3 ========
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Uploading wheels to $S3_COMMIT_PREFIX"
|
|
||||||
for wheel in all-rocm-wheels/*.whl; do
|
|
||||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
|
||||||
done
|
|
||||||
|
|
||||||
# ======== Part 3: Generate and upload indices ========
|
|
||||||
|
|
||||||
# List existing wheels in commit directory
|
|
||||||
echo ""
|
|
||||||
echo "Generating indices..."
|
|
||||||
obj_json="rocm-objects.json"
|
|
||||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
|
|
||||||
|
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
|
||||||
|
|
||||||
# Use the existing generate-nightly-index.py
|
|
||||||
# HACK: Replace regex module with stdlib re (same as CUDA script)
|
|
||||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
|
||||||
|
|
||||||
$PYTHON .buildkite/scripts/generate-nightly-index.py \
|
|
||||||
--version "$ROCM_SUBPATH" \
|
|
||||||
--current-objects "$obj_json" \
|
|
||||||
--output-dir "$INDICES_OUTPUT_DIR" \
|
|
||||||
--comment "ROCm commit $BUILDKITE_COMMIT"
|
|
||||||
|
|
||||||
# Upload indices to commit directory
|
|
||||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
|
||||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
|
||||||
|
|
||||||
# Update rocm/nightly/ if on main branch and not a PR
|
|
||||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
|
||||||
echo "Updating rocm/nightly/ index..."
|
|
||||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Extract version from vLLM wheel and update version-specific index
|
|
||||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
|
||||||
if [ -n "$VLLM_WHEEL" ]; then
|
|
||||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
|
||||||
echo "Version in wheel: $VERSION"
|
|
||||||
PURE_VERSION="${VERSION%%+*}"
|
|
||||||
PURE_VERSION="${PURE_VERSION%%.rocm}"
|
|
||||||
echo "Pure version: $PURE_VERSION"
|
|
||||||
|
|
||||||
if [[ "$VERSION" != *"dev"* ]]; then
|
|
||||||
echo "Updating rocm/$PURE_VERSION/ index..."
|
|
||||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ======== Part 4: Summary ========
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "========================================"
|
|
||||||
echo "ROCm Wheel Upload Complete!"
|
|
||||||
echo "========================================"
|
|
||||||
echo ""
|
|
||||||
echo "Wheels available at:"
|
|
||||||
echo " s3://$BUCKET/$ROCM_SUBPATH/"
|
|
||||||
echo ""
|
|
||||||
echo "Install command (by commit):"
|
|
||||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
|
|
||||||
echo ""
|
|
||||||
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
|
||||||
echo "Install command (nightly):"
|
|
||||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
|
|
||||||
fi
|
|
||||||
echo ""
|
|
||||||
echo "Wheel count: $WHEEL_COUNT"
|
|
||||||
echo "========================================"
|
|
||||||
@@ -102,7 +102,6 @@ if [[ "$version" != *"dev"* ]]; then
|
|||||||
echo "Re-generating indices for /$pure_version/"
|
echo "Re-generating indices for /$pure_version/"
|
||||||
rm -rf "$INDICES_OUTPUT_DIR/*"
|
rm -rf "$INDICES_OUTPUT_DIR/*"
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||||
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
||||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
|
||||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||||
fi
|
fi
|
||||||
@@ -128,7 +128,7 @@ steps:
|
|||||||
- tests/entrypoints/
|
- tests/entrypoints/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s entrypoints/openai/tool_parsers
|
- pytest -v -s entrypoints/openai/tool_parsers
|
||||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (LLM) # 30min
|
- label: Entrypoints Integration Test (LLM) # 30min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@@ -148,7 +148,7 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server 1) # 100min
|
- label: Entrypoints Integration Test (API Server) # 100min
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
@@ -162,28 +162,10 @@ steps:
|
|||||||
- tests/entrypoints/test_chat_utils
|
- tests/entrypoints/test_chat_utils
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||||
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server 2)
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
agent_pool: mi325_1
|
|
||||||
# grade: Blocking
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
fast_check: true
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/entrypoints/sleep
|
|
||||||
- tests/entrypoints/rpc
|
|
||||||
- tests/tool_use
|
|
||||||
commands:
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s entrypoints/sleep
|
|
||||||
- pytest -v -s tool_use
|
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (Pooling)
|
- label: Entrypoints Integration Test (Pooling)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -199,21 +181,6 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/pooling
|
- pytest -v -s entrypoints/pooling
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (Responses API)
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
agent_pool: mi325_1
|
|
||||||
# grade: Blocking
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
fast_check: true
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/entrypoints/openai/responses
|
|
||||||
commands:
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s entrypoints/openai/responses
|
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 35min
|
- label: Distributed Tests (4 GPUs) # 35min
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -234,9 +201,6 @@ steps:
|
|||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
- tests/distributed/test_symm_mem_allreduce.py
|
||||||
commands:
|
commands:
|
||||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
|
||||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
|
||||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
|
||||||
# test with torchrun tp=2 and external_dp=2
|
# test with torchrun tp=2 and external_dp=2
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=2 and pp=2
|
# test with torchrun tp=2 and pp=2
|
||||||
@@ -285,10 +249,9 @@ steps:
|
|||||||
- vllm/v1/executor/uniproc_executor.py
|
- vllm/v1/executor/uniproc_executor.py
|
||||||
- vllm/v1/worker/gpu_worker.py
|
- vllm/v1/worker/gpu_worker.py
|
||||||
commands:
|
commands:
|
||||||
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
|
#- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
# test with torchrun tp=2 and dp=4 with ep
|
# test with torchrun tp=2 and dp=4 with ep
|
||||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
|
||||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
|
||||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
|
||||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||||
|
|
||||||
- label: EPLB Algorithm Test # 5min
|
- label: EPLB Algorithm Test # 5min
|
||||||
@@ -368,9 +331,7 @@ steps:
|
|||||||
- label: V1 Test e2e + engine # 65min
|
- label: V1 Test e2e + engine # 65min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 90
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
|
agent_pool: mi325_4
|
||||||
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
|
|
||||||
agent_pool: mi325_8
|
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -531,7 +492,8 @@ steps:
|
|||||||
- tests/samplers
|
- tests/samplers
|
||||||
- tests/conftest.py
|
- tests/conftest.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s -m 'not skip_v1' samplers
|
- pytest -v -s samplers
|
||||||
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
- label: LoRA Test %N # 20min each
|
- label: LoRA Test %N # 20min each
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -745,7 +707,7 @@ steps:
|
|||||||
|
|
||||||
- label: Quantization Test # 70min
|
- label: Quantization Test # 70min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 90
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -760,7 +722,7 @@ steps:
|
|||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
# TODO(jerryzh168): resolve the above comment
|
||||||
- uv pip install --system torchao==0.14.1
|
- uv pip install --system torchao==0.13.0
|
||||||
- uv pip install --system conch-triton-kernels
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
@@ -774,7 +736,7 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
autorun_on_main: true
|
autorun_on_main: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: OpenAI API correctness # 10min
|
- label: OpenAI API correctness # 10min
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
@@ -785,11 +747,21 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/entrypoints/openai/
|
- vllm/entrypoints/openai/
|
||||||
- vllm/model_executor/models/whisper.py
|
- vllm/model_executor/models/whisper.py
|
||||||
- tools/
|
|
||||||
commands: # LMEval+Transcription WER check
|
commands: # LMEval+Transcription WER check
|
||||||
- bash ../tools/install_torchcodec_rocm.sh || exit 1
|
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
||||||
- pytest -s entrypoints/openai/correctness/
|
- pytest -s entrypoints/openai/correctness/
|
||||||
|
|
||||||
|
- label: OpenAI-Compatible Tool Use # 23 min
|
||||||
|
timeout_in_minutes: 35
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_1
|
||||||
|
# grade: Blocking
|
||||||
|
fast_check: false
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/tool_use
|
||||||
|
commands:
|
||||||
|
- pytest -v -s tool_use
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
@@ -882,7 +854,6 @@ steps:
|
|||||||
# Shard slow subset of standard language models tests. Only run when model
|
# Shard slow subset of standard language models tests. Only run when model
|
||||||
# source is modified, or when specified test files are modified
|
# source is modified, or when specified test files are modified
|
||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
|
||||||
- pytest -v -s models/language -m 'core_model and slow_test' \
|
- pytest -v -s models/language -m 'core_model and slow_test' \
|
||||||
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||||
--shard-id=$$BUILDKITE_PARALLEL_JOB
|
--shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||||
@@ -900,7 +871,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install fast path packages for testing against transformers
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
# Shard hybrid language model tests
|
# Shard hybrid language model tests
|
||||||
- pytest -v -s models/language/generation \
|
- pytest -v -s models/language/generation \
|
||||||
@@ -921,7 +892,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install fast path packages for testing against transformers
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
|
||||||
@@ -986,7 +957,7 @@ steps:
|
|||||||
- pytest -v -s models/multimodal/processing
|
- pytest -v -s models/multimodal/processing
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Standard) # 60min
|
- label: Multi-Modal Models Test (Standard) # 60min
|
||||||
timeout_in_minutes: 100
|
timeout_in_minutes: 80
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
@@ -995,16 +966,13 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- export MIOPEN_DEBUG_CONV_DIRECT=0
|
|
||||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
|
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
|
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 180
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
@@ -1014,9 +982,7 @@ steps:
|
|||||||
- vllm/inputs/
|
- vllm/inputs/
|
||||||
- vllm/v1/core/
|
- vllm/v1/core/
|
||||||
commands:
|
commands:
|
||||||
- export MIOPEN_DEBUG_CONV_DIRECT=0
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
@@ -1028,13 +994,10 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- export MIOPEN_DEBUG_CONV_DIRECT=0
|
|
||||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 2 #60min
|
- label: Multi-Modal Models Test (Extended) 2
|
||||||
timeout_in_minutes: 120
|
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
@@ -1043,8 +1006,6 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- export MIOPEN_DEBUG_CONV_DIRECT=0
|
|
||||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||||
|
|
||||||
@@ -1058,8 +1019,6 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- export MIOPEN_DEBUG_CONV_DIRECT=0
|
|
||||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||||
|
|
||||||
@@ -1119,8 +1078,8 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||||
- vllm/v1/attention/selector.py
|
|
||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
|
- vllm/attention/selector.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
@@ -1237,7 +1196,7 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@@ -1277,13 +1236,13 @@ steps:
|
|||||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 68min
|
- label: Distributed Tests (2 GPUs) # 68min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 90
|
||||||
@@ -1309,9 +1268,6 @@ steps:
|
|||||||
- tests/v1/shutdown
|
- tests/v1/shutdown
|
||||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||||
commands:
|
commands:
|
||||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
|
||||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
|
||||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
@@ -1461,22 +1417,8 @@ steps:
|
|||||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
- tests/v1/kv_connector/nixl_integration/
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
agent_pool: mi325_4
|
|
||||||
# grade: Blocking
|
|
||||||
timeout_in_minutes: 15
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
num_gpus: 4
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
|
||||||
- tests/v1/kv_connector/nixl_integration/
|
|
||||||
commands:
|
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
|
||||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
@@ -1548,7 +1490,7 @@ steps:
|
|||||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
@@ -1572,7 +1514,7 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 Card)
|
- label: LM Eval Large Models (4 Card)
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@@ -1627,8 +1569,6 @@ steps:
|
|||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|
||||||
##### EPLB Accuracy Tests #####
|
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
agent_pool: mi325_4
|
agent_pool: mi325_4
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ steps:
|
|||||||
- tests/entrypoints/
|
- tests/entrypoints/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s entrypoints/openai/tool_parsers
|
- pytest -v -s entrypoints/openai/tool_parsers
|
||||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (LLM) # 30min
|
- label: Entrypoints Integration Test (LLM) # 30min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@@ -132,7 +132,7 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server 1) # 100min
|
- label: Entrypoints Integration Test (API Server) # 100min
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
@@ -144,26 +144,10 @@ steps:
|
|||||||
- tests/entrypoints/test_chat_utils
|
- tests/entrypoints/test_chat_utils
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||||
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server 2)
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
fast_check: true
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/entrypoints/sleep
|
|
||||||
- tests/entrypoints/rpc
|
|
||||||
- tests/tool_use
|
|
||||||
commands:
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s entrypoints/sleep
|
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
|
||||||
- pytest -v -s tool_use
|
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (Pooling)
|
- label: Entrypoints Integration Test (Pooling)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -177,18 +161,6 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/pooling
|
- pytest -v -s entrypoints/pooling
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (Responses API)
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
fast_check: true
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/entrypoints/openai/responses
|
|
||||||
commands:
|
|
||||||
- pytest -v -s entrypoints/openai/responses
|
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 35min
|
- label: Distributed Tests (4 GPUs) # 35min
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -331,10 +303,7 @@ steps:
|
|||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- pytest -v -s v1/e2e
|
||||||
# Run this test standalone for now;
|
- pytest -v -s v1/engine
|
||||||
# need to untangle use (implicit) use of spawn/fork across the tests.
|
|
||||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
|
||||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
|
||||||
|
|
||||||
- label: V1 Test entrypoints # 35min
|
- label: V1 Test entrypoints # 35min
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
@@ -673,7 +642,7 @@ steps:
|
|||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
# TODO(jerryzh168): resolve the above comment
|
||||||
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||||
- uv pip install --system conch-triton-kernels
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
@@ -685,7 +654,7 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
autorun_on_main: true
|
autorun_on_main: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: OpenAI API correctness # 22min
|
- label: OpenAI API correctness # 22min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -697,6 +666,16 @@ steps:
|
|||||||
commands: # LMEval+Transcription WER check
|
commands: # LMEval+Transcription WER check
|
||||||
- pytest -s entrypoints/openai/correctness/
|
- pytest -s entrypoints/openai/correctness/
|
||||||
|
|
||||||
|
- label: OpenAI-Compatible Tool Use # 23 min
|
||||||
|
timeout_in_minutes: 35
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
fast_check: false
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/tool_use
|
||||||
|
commands:
|
||||||
|
- pytest -v -s tool_use
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
- label: Basic Models Tests (Initialization)
|
- label: Basic Models Tests (Initialization)
|
||||||
@@ -955,6 +934,7 @@ steps:
|
|||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
|
# optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- csrc/attention/mla/
|
- csrc/attention/mla/
|
||||||
@@ -966,8 +946,8 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||||
- vllm/v1/attention/selector.py
|
|
||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
|
- vllm/attention/selector.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
@@ -1084,7 +1064,7 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@@ -1116,18 +1096,17 @@ steps:
|
|||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- .buildkite/scripts/run-multi-node-test.sh
|
|
||||||
commands:
|
commands:
|
||||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 68min
|
- label: Distributed Tests (2 GPUs) # 68min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 90
|
||||||
@@ -1279,8 +1258,8 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
- label: NixlConnector PD accuracy tests (Distributed) # 40min
|
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -1288,18 +1267,7 @@ steps:
|
|||||||
- tests/v1/kv_connector/nixl_integration/
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
|
||||||
timeout_in_minutes: 15
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
num_gpus: 4
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
|
||||||
- tests/v1/kv_connector/nixl_integration/
|
|
||||||
commands:
|
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
|
||||||
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@@ -1357,17 +1325,9 @@ steps:
|
|||||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
- label: LM Eval Large Models (H200) # optional
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 8
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
- label: Distributed Tests (B200) # optional
|
- label: Distributed Tests (B200) # optional
|
||||||
gpu: b200
|
gpu: b200
|
||||||
@@ -1390,7 +1350,6 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
@@ -1419,26 +1378,3 @@ steps:
|
|||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
##### MoE Refactor (Temporary) Tests #####
|
|
||||||
|
|
||||||
- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
|
||||||
|
|
||||||
- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
|
||||||
|
|
||||||
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
|
||||||
|
|||||||
@@ -145,7 +145,7 @@ steps:
|
|||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(B200)
|
- label: Distributed Tests (2 GPUs)(B200)
|
||||||
@@ -171,7 +171,7 @@ steps:
|
|||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
commands:
|
commands:
|
||||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||||
|
|
||||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -182,7 +182,7 @@ steps:
|
|||||||
- tests/v1/kv_connector/nixl_integration/
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
- label: Pipeline + Context Parallelism (4 GPUs))
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ steps:
|
|||||||
- label: Prime-RL Integration (2 GPUs)
|
- label: Prime-RL Integration (2 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
optional: true
|
optional: true
|
||||||
soft_fail: true
|
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -40,3 +39,21 @@ steps:
|
|||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|
||||||
|
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||||
|
|
||||||
|
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ steps:
|
|||||||
- tests/entrypoints/
|
- tests/entrypoints/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s entrypoints/openai/tool_parsers
|
- pytest -v -s entrypoints/openai/tool_parsers
|
||||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||||
|
|
||||||
- label: Entrypoints Integration (LLM)
|
- label: Entrypoints Integration (LLM)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@@ -25,7 +25,7 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 1)
|
- label: Entrypoints Integration (API Server)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -34,24 +34,10 @@ steps:
|
|||||||
- tests/entrypoints/test_chat_utils
|
- tests/entrypoints/test_chat_utils
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||||
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 2)
|
|
||||||
timeout_in_minutes: 130
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/tool_use
|
|
||||||
- tests/entrypoints/sleep
|
|
||||||
- tests/entrypoints/instrumentator
|
|
||||||
- tests/entrypoints/rpc
|
|
||||||
commands:
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
|
||||||
- pytest -v -s entrypoints/instrumentator
|
|
||||||
- pytest -v -s entrypoints/sleep
|
|
||||||
- pytest -v -s tool_use
|
|
||||||
|
|
||||||
- label: Entrypoints Integration (Pooling)
|
- label: Entrypoints Integration (Pooling)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
@@ -63,14 +49,6 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/pooling
|
- pytest -v -s entrypoints/pooling
|
||||||
|
|
||||||
- label: Entrypoints Integration (Responses API)
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/entrypoints/openai/responses
|
|
||||||
commands:
|
|
||||||
- pytest -v -s entrypoints/openai/responses
|
|
||||||
|
|
||||||
- label: Entrypoints V1
|
- label: Entrypoints V1
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
|
|||||||
@@ -90,8 +90,8 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||||
- vllm/v1/attention/selector.py
|
|
||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
|
- vllm/attention/selector.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ steps:
|
|||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
autorun_on_main: true
|
autorun_on_main: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||||
gpu: a100
|
gpu: a100
|
||||||
@@ -43,4 +43,4 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||||
|
|||||||
@@ -22,8 +22,6 @@ steps:
|
|||||||
# FIXIT: find out which code initialize cuda before running the test
|
# FIXIT: find out which code initialize cuda before running the test
|
||||||
# before the fix, we need to use spawn to test it
|
# before the fix, we need to use spawn to test it
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
# Alot of these tests are on the edge of OOMing
|
|
||||||
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
|
||||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||||
# requires multi-GPU testing for validation.
|
# requires multi-GPU testing for validation.
|
||||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/test_initialization.py
|
- tests/models/test_initialization.py
|
||||||
- tests/models/registry.py
|
|
||||||
commands:
|
commands:
|
||||||
# Run a subset of model initialization tests
|
# Run a subset of model initialization tests
|
||||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||||
@@ -21,7 +20,6 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
- tests/models/test_initialization.py
|
- tests/models/test_initialization.py
|
||||||
- tests/models/registry.py
|
|
||||||
commands:
|
commands:
|
||||||
# Only when vLLM model source is modified - test initialization of a large
|
# Only when vLLM model source is modified - test initialization of a large
|
||||||
# subset of supported models (the complement of the small subset in the above
|
# subset of supported models (the complement of the small subset in the above
|
||||||
|
|||||||
@@ -13,9 +13,7 @@ steps:
|
|||||||
# tests covered elsewhere.
|
# tests covered elsewhere.
|
||||||
# Use `find` to launch multiple instances of pytest so that
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
|
||||||
# (using -0 for proper path handling)
|
|
||||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test
|
- label: PyTorch Fullgraph Smoke Test
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
|
|||||||
13
.buildkite/test_areas/tool_use.yaml
Normal file
13
.buildkite/test_areas/tool_use.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
group: Tool use
|
||||||
|
depends_on:
|
||||||
|
- image-build
|
||||||
|
steps:
|
||||||
|
- label: OpenAI-Compatible Tool Use
|
||||||
|
timeout_in_minutes: 35
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
fast_check: false
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/tool_use
|
||||||
|
commands:
|
||||||
|
- pytest -v -s tool_use
|
||||||
11
.github/CODEOWNERS
vendored
11
.github/CODEOWNERS
vendored
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
# This lists cover the "core" components of vLLM that require careful review
|
# This lists cover the "core" components of vLLM that require careful review
|
||||||
/vllm/attention @LucasWilkinson
|
/vllm/attention @LucasWilkinson
|
||||||
|
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||||
@@ -14,7 +15,6 @@
|
|||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
/vllm/entrypoints @aarnphm @chaunceyjiang
|
||||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
|
||||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
||||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||||
@@ -26,7 +26,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1/attention @LucasWilkinson
|
||||||
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
|
||||||
/vllm/v1/attention/backends/mla @pavanimajety
|
/vllm/v1/attention/backends/mla @pavanimajety
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||||
@@ -117,15 +116,15 @@ mkdocs.yaml @hmellor
|
|||||||
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
||||||
|
|
||||||
# Kernels
|
# Kernels
|
||||||
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
||||||
/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
|
/vllm/attention/ops/triton_unified_attention.py @tdoublep
|
||||||
|
|
||||||
# ROCm related: specify owner with write access to notify AMD folks for careful code review
|
# ROCm related: specify owner with write access to notify AMD folks for careful code review
|
||||||
/vllm/**/*rocm* @tjtanaa
|
/vllm/**/*rocm* @tjtanaa
|
||||||
/docker/Dockerfile.rocm* @gshtras @tjtanaa
|
/docker/Dockerfile.rocm* @gshtras @tjtanaa
|
||||||
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
|
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
|
||||||
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
|
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
|
||||||
/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
|
/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
|
||||||
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
|
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
|
||||||
/csrc/rocm @gshtras @tjtanaa
|
/csrc/rocm @gshtras @tjtanaa
|
||||||
/requirements/*rocm* @tjtanaa
|
/requirements/*rocm* @tjtanaa
|
||||||
@@ -153,7 +152,7 @@ mkdocs.yaml @hmellor
|
|||||||
/vllm/entrypoints/pooling @noooop
|
/vllm/entrypoints/pooling @noooop
|
||||||
/vllm/config/pooler.py @noooop
|
/vllm/config/pooler.py @noooop
|
||||||
/vllm/pooling_params.py @noooop
|
/vllm/pooling_params.py @noooop
|
||||||
/vllm/model_executor/layers/pooler @noooop
|
/vllm/model_executor/layers/pooler.py @noooop
|
||||||
|
|
||||||
# Security guide and policies
|
# Security guide and policies
|
||||||
/docs/usage/security.md @russellb
|
/docs/usage/security.md @russellb
|
||||||
|
|||||||
30
.github/mergify.yml
vendored
30
.github/mergify.yml
vendored
@@ -222,10 +222,10 @@ pull_request_rules:
|
|||||||
- files~=^csrc/rocm/
|
- files~=^csrc/rocm/
|
||||||
- files~=^docker/Dockerfile.rocm
|
- files~=^docker/Dockerfile.rocm
|
||||||
- files~=^requirements/rocm.*\.txt
|
- files~=^requirements/rocm.*\.txt
|
||||||
|
- files~=^vllm/attention/backends/rocm.*\.py
|
||||||
|
- files~=^vllm/attention/ops/rocm.*\.py
|
||||||
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
|
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
|
||||||
- files~=^vllm/v1/attention/backends/rocm.*\.py
|
|
||||||
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
|
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
|
||||||
- files~=^vllm/v1/attention/ops/rocm.*\.py
|
|
||||||
- files~=^tests/kernels/.*_rocm.*\.py
|
- files~=^tests/kernels/.*_rocm.*\.py
|
||||||
- files=vllm/platforms/rocm.py
|
- files=vllm/platforms/rocm.py
|
||||||
- title~=(?i)AMD
|
- title~=(?i)AMD
|
||||||
@@ -235,20 +235,6 @@ pull_request_rules:
|
|||||||
add:
|
add:
|
||||||
- rocm
|
- rocm
|
||||||
|
|
||||||
- name: label-cpu
|
|
||||||
description: Automatically apply cpu label
|
|
||||||
conditions:
|
|
||||||
- label != stale
|
|
||||||
- files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
|
|
||||||
actions:
|
|
||||||
label:
|
|
||||||
add:
|
|
||||||
- cpu
|
|
||||||
assign:
|
|
||||||
users:
|
|
||||||
- "fadara01"
|
|
||||||
- "aditew01"
|
|
||||||
|
|
||||||
- name: label-structured-output
|
- name: label-structured-output
|
||||||
description: Automatically apply structured-output label
|
description: Automatically apply structured-output label
|
||||||
conditions:
|
conditions:
|
||||||
@@ -349,18 +335,6 @@ pull_request_rules:
|
|||||||
add:
|
add:
|
||||||
- tool-calling
|
- tool-calling
|
||||||
|
|
||||||
- name: auto-rebase if approved, ready, and 40 commits behind main
|
|
||||||
conditions:
|
|
||||||
- base = main
|
|
||||||
- label=ready
|
|
||||||
- "#approved-reviews-by >= 1"
|
|
||||||
- "#commits-behind >= 40"
|
|
||||||
- -closed
|
|
||||||
- -draft
|
|
||||||
- -conflict
|
|
||||||
actions:
|
|
||||||
rebase: {}
|
|
||||||
|
|
||||||
- name: ping author on conflicts and add 'needs-rebase' label
|
- name: ping author on conflicts and add 'needs-rebase' label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
- label != stale
|
||||||
|
|||||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -227,8 +227,3 @@ ep_kernels_workspace/
|
|||||||
|
|
||||||
# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
|
# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
|
||||||
!vllm/benchmarks/lib/
|
!vllm/benchmarks/lib/
|
||||||
|
|
||||||
# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
|
|
||||||
vllm/grpc/vllm_engine_pb2.py
|
|
||||||
vllm/grpc/vllm_engine_pb2_grpc.py
|
|
||||||
vllm/grpc/vllm_engine_pb2.pyi
|
|
||||||
|
|||||||
130
CMakeLists.txt
130
CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@@ -282,7 +282,6 @@ endif()
|
|||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||||
"csrc/cache_kernels.cu"
|
"csrc/cache_kernels.cu"
|
||||||
"csrc/cache_kernels_fused.cu"
|
|
||||||
"csrc/attention/paged_attention_v1.cu"
|
"csrc/attention/paged_attention_v1.cu"
|
||||||
"csrc/attention/paged_attention_v2.cu"
|
"csrc/attention/paged_attention_v2.cu"
|
||||||
"csrc/attention/merge_attn_states.cu"
|
"csrc/attention/merge_attn_states.cu"
|
||||||
@@ -358,8 +357,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# marlin arches for fp16 output
|
# marlin arches for fp16 output
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
||||||
# marlin has limited support for turing
|
|
||||||
cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
|
|
||||||
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
|
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
|
||||||
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||||
# marlin arches for fp8 input
|
# marlin arches for fp8 input
|
||||||
@@ -367,10 +364,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
||||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
||||||
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
||||||
# marlin arches for other files
|
|
||||||
cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
|
|
||||||
if (MARLIN_OTHER_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
#
|
#
|
||||||
# For the Marlin kernels we automatically generate sources for various
|
# For the Marlin kernels we automatically generate sources for various
|
||||||
@@ -411,39 +406,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_ARCHS)
|
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
set_gencode_flags_for_srcs(
|
||||||
set_gencode_flags_for_srcs(
|
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
|
||||||
|
|
||||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
|
||||||
endif()
|
endif()
|
||||||
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||||
|
|
||||||
if (MARLIN_SM75_ARCHS)
|
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
set_gencode_flags_for_srcs(
|
||||||
set_gencode_flags_for_srcs(
|
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||||
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
||||||
set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
|
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
|
||||||
endif()
|
endif()
|
||||||
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||||
|
|
||||||
if (MARLIN_FP8_ARCHS)
|
if (MARLIN_FP8_ARCHS)
|
||||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||||
@@ -465,14 +446,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_SRCS}"
|
SRCS "${MARLIN_SRCS}"
|
||||||
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
|
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||||
set_source_files_properties(${MARLIN_SRCS}
|
set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||||
endif()
|
endif()
|
||||||
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
|
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
|
||||||
|
|
||||||
message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
|
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin kernels as no compatible archs found"
|
message(STATUS "Not building Marlin kernels as no compatible archs found"
|
||||||
" in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
@@ -800,6 +781,24 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
else()
|
else()
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
endif()
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
|
||||||
|
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
|
||||||
|
"if you intend on running FP8 quantized MoE models on Blackwell.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
@@ -981,16 +980,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# note that we always set `use_atomic_add=False` for moe marlin now,
|
# note that we always set `use_atomic_add=False` for moe marlin now,
|
||||||
# so we don't need 9.0 for bf16 atomicAdd PTX
|
# so we don't need 9.0 for bf16 atomicAdd PTX
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
||||||
# moe marlin has limited support for turing
|
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
|
|
||||||
# moe marlin arches for fp8 input
|
# moe marlin arches for fp8 input
|
||||||
# - sm80 doesn't support fp8 computation
|
# - sm80 doesn't support fp8 computation
|
||||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
||||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
||||||
# moe marlin arches for other files
|
if (MARLIN_MOE_ARCHS)
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
|
|
||||||
if (MARLIN_MOE_OTHER_ARCHS)
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# For the Marlin MOE kernels we automatically generate sources for various
|
# For the Marlin MOE kernels we automatically generate sources for various
|
||||||
@@ -1031,29 +1026,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_MOE_ARCHS)
|
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
|
||||||
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
|
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_MOE_SRC}"
|
SRCS "${MARLIN_MOE_SRC}"
|
||||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||||
set_source_files_properties(${MARLIN_MOE_SRC}
|
set_source_files_properties(${MARLIN_MOE_SRC}
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (MARLIN_MOE_SM75_ARCHS)
|
|
||||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_MOE_SM75_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
|
|
||||||
endif()
|
endif()
|
||||||
|
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||||
|
|
||||||
if (MARLIN_MOE_FP8_ARCHS)
|
if (MARLIN_MOE_FP8_ARCHS)
|
||||||
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
|
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
|
||||||
@@ -1067,17 +1049,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
|
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
|
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${MARLIN_MOE_OTHER_SRC}"
|
|
||||||
CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
|
|
||||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
|
||||||
endif()
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
|
|
||||||
|
|
||||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
|
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
" in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
|
|||||||
93
README.md
93
README.md
@@ -14,8 +14,51 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
🔥 We have built a vllm website to help you get started with vllm. Please visit [vllm.ai](https://vllm.ai) to learn more.
|
---
|
||||||
For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
|
Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Latest News* 🔥
|
||||||
|
|
||||||
|
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
|
||||||
|
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
|
||||||
|
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
|
||||||
|
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
|
||||||
|
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
||||||
|
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||||
|
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||||
|
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||||
|
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
||||||
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Previous News</summary>
|
||||||
|
|
||||||
|
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
|
||||||
|
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
|
||||||
|
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
||||||
|
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||||
|
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||||
|
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||||
|
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||||
|
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||||
|
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||||
|
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||||
|
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||||
|
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||||
|
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
||||||
|
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||||
|
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||||
|
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||||
|
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||||
|
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||||
|
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||||
|
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||||
|
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||||
|
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -75,6 +118,50 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
|||||||
We welcome and value any contributions and collaborations.
|
We welcome and value any contributions and collaborations.
|
||||||
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
|
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
|
||||||
|
|
||||||
|
## Sponsors
|
||||||
|
|
||||||
|
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
|
||||||
|
|
||||||
|
<!-- Note: Please sort them in alphabetical order. -->
|
||||||
|
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
|
||||||
|
Cash Donations:
|
||||||
|
|
||||||
|
- a16z
|
||||||
|
- Dropbox
|
||||||
|
- Sequoia Capital
|
||||||
|
- Skywork AI
|
||||||
|
- ZhenFund
|
||||||
|
|
||||||
|
Compute Resources:
|
||||||
|
|
||||||
|
- Alibaba Cloud
|
||||||
|
- AMD
|
||||||
|
- Anyscale
|
||||||
|
- Arm
|
||||||
|
- AWS
|
||||||
|
- Crusoe Cloud
|
||||||
|
- Databricks
|
||||||
|
- DeepInfra
|
||||||
|
- Google Cloud
|
||||||
|
- IBM
|
||||||
|
- Intel
|
||||||
|
- Lambda Lab
|
||||||
|
- Nebius
|
||||||
|
- Novita AI
|
||||||
|
- NVIDIA
|
||||||
|
- Red Hat
|
||||||
|
- Replicate
|
||||||
|
- Roblox
|
||||||
|
- RunPod
|
||||||
|
- Trainy
|
||||||
|
- UC Berkeley
|
||||||
|
- UC San Diego
|
||||||
|
- Volcengine
|
||||||
|
|
||||||
|
Slack Sponsor: Anyscale
|
||||||
|
|
||||||
|
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
||||||
@@ -95,7 +182,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
|||||||
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
|
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
|
||||||
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
||||||
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
|
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
|
||||||
- For collaborations and partnerships, please contact us at [collaboration@vllm.ai](mailto:collaboration@vllm.ai)
|
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
|
||||||
<!-- --8<-- [end:contact-us] -->
|
<!-- --8<-- [end:contact-us] -->
|
||||||
|
|
||||||
## Media Kit
|
## Media Kit
|
||||||
|
|||||||
47
RELEASE.md
47
RELEASE.md
@@ -1,30 +1,47 @@
|
|||||||
# Releasing vLLM
|
# Releasing vLLM
|
||||||
|
|
||||||
vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via [PyPI](https://pypi.org/project/vllm). These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
|
vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
|
||||||
|
|
||||||
## Release Cadence and Versioning
|
## Release Versioning
|
||||||
|
|
||||||
We aim to have a regular release every 2 weeks. Since v0.12.0, regular releases increment the minor version rather than patch version. The list of past releases can be found [here](https://vllm.ai/releases).
|
vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
|
||||||
|
|
||||||
Our version numbers are expressed in the form `vX.Y.Z`, where `X` is the major version, `Y` is the minor version, and `Z` is the patch version. They are incremented according to the following rules:
|
* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
|
||||||
|
* _minor_ major features
|
||||||
|
* _patch_ features and backwards-compatible bug fixes
|
||||||
|
* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
|
||||||
|
|
||||||
* _Major_ releases are reserved for architectural milestones involving sweeping API changes, similar to PyTorch 2.0.
|
## Release Cadence
|
||||||
* _Minor_ releases correspond to regular releases, which include new features, bug fixes and other backwards-compatible changes.
|
|
||||||
* _Patch_ releases correspond to special releases for new models, as well as emergency patches for critical performance, functionality and security issues.
|
|
||||||
|
|
||||||
This versioning scheme is similar to [SemVer](https://semver.org/) for compatibility purposes, except that backwards compatibility is only guaranteed for a limited number of minor releases (see our [deprecation policy](https://docs.vllm.ai/en/latest/contributing/deprecation_policy) for details).
|
Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
|
||||||
|
Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
|
||||||
|
|
||||||
## Release Branch
|
| Release Date | Patch release versions | Post Release versions |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Jan 2025 | 0.7.0 | --- |
|
||||||
|
| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
|
||||||
|
| Mar 2025 | 0.7.4, 0.7.5 | --- |
|
||||||
|
| Apr 2025 | 0.7.6, 0.7.7 | --- |
|
||||||
|
| May 2025 | 0.7.8, 0.7.9 | --- |
|
||||||
|
| Jun 2025 | 0.7.10, 0.7.11 | --- |
|
||||||
|
| Jul 2025 | 0.7.12, 0.7.13 | --- |
|
||||||
|
| Aug 2025 | 0.7.14, 0.7.15 | --- |
|
||||||
|
| Sep 2025 | 0.7.16, 0.7.17 | --- |
|
||||||
|
| Oct 2025 | 0.7.18, 0.7.19 | --- |
|
||||||
|
| Nov 2025 | 0.7.20, 0.7.21 | --- |
|
||||||
|
| Dec 2025 | 0.7.22, 0.7.23 | --- |
|
||||||
|
|
||||||
|
## Release branch
|
||||||
|
|
||||||
Each release is built from a dedicated release branch.
|
Each release is built from a dedicated release branch.
|
||||||
|
|
||||||
* For _major_ and _minor_ releases, the release branch cut is performed 1-2 days before release is live.
|
* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
|
||||||
* For _patch_ releases, previously cut release branch is reused.
|
* For post releases, previously cut release branch is reused
|
||||||
* Release builds are triggered via push to RC tag like `vX.Y.Z-rc1`. This enables us to build and test multiple RCs for each release.
|
* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
|
||||||
* Final tag: `vX.Y.Z` does not trigger the build but used for Release notes and assets.
|
* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
|
||||||
* After branch cut is created, we monitor the main branch for any reverts and apply these reverts to a release branch.
|
* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
|
||||||
|
|
||||||
### Cherry-Pick Criteria
|
## Release Cherry-Pick Criteria
|
||||||
|
|
||||||
After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
|
After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
|
||||||
|
|
||||||
|
|||||||
@@ -104,6 +104,7 @@ def run_benchmark_with_batch_invariant(
|
|||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
|
|
||||||
# Set environment variables
|
# Set environment variables
|
||||||
|
os.environ["VLLM_ATTENTION_BACKEND"] = backend
|
||||||
if batch_invariant:
|
if batch_invariant:
|
||||||
os.environ["VLLM_BATCH_INVARIANT"] = "1"
|
os.environ["VLLM_BATCH_INVARIANT"] = "1"
|
||||||
else:
|
else:
|
||||||
@@ -139,7 +140,6 @@ def run_benchmark_with_batch_invariant(
|
|||||||
max_model_len=max_model_len,
|
max_model_len=max_model_len,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
attention_config={"backend": backend},
|
|
||||||
enable_prefix_caching=False,
|
enable_prefix_caching=False,
|
||||||
)
|
)
|
||||||
init_time = time.perf_counter() - start_init
|
init_time = time.perf_counter() - start_init
|
||||||
|
|||||||
@@ -135,6 +135,7 @@ def benchmark_batched_propose(args):
|
|||||||
block_sizes=[16],
|
block_sizes=[16],
|
||||||
)
|
)
|
||||||
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
|
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
|
||||||
|
dummy_input_batch.spec_decode_unsupported_reqs = ()
|
||||||
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
|
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
|
||||||
dummy_input_batch.token_ids_cpu = np.random.randint(
|
dummy_input_batch.token_ids_cpu = np.random.randint(
|
||||||
0, 20, (args.num_req, args.num_token)
|
0, 20, (args.num_req, args.num_token)
|
||||||
@@ -150,8 +151,10 @@ def benchmark_batched_propose(args):
|
|||||||
start = time.time()
|
start = time.time()
|
||||||
runner.drafter.propose(
|
runner.drafter.propose(
|
||||||
sampled_token_ids,
|
sampled_token_ids,
|
||||||
|
dummy_input_batch.req_ids,
|
||||||
dummy_input_batch.num_tokens_no_spec,
|
dummy_input_batch.num_tokens_no_spec,
|
||||||
dummy_input_batch.token_ids_cpu,
|
dummy_input_batch.token_ids_cpu,
|
||||||
|
dummy_input_batch.spec_decode_unsupported_reqs,
|
||||||
)
|
)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(f"Iteration time (s): {end - start}")
|
print(f"Iteration time (s): {end - start}")
|
||||||
|
|||||||
@@ -343,9 +343,7 @@ def bench(
|
|||||||
return bench_int8(dtype, m, k, n, label, sub_label)
|
return bench_int8(dtype, m, k, n, label, sub_label)
|
||||||
if dtype == torch.float8_e4m3fn:
|
if dtype == torch.float8_e4m3fn:
|
||||||
return bench_fp8(dtype, m, k, n, label, sub_label)
|
return bench_fp8(dtype, m, k, n, label, sub_label)
|
||||||
raise ValueError(
|
raise ValueError("unsupported type")
|
||||||
f"Unsupported dtype {dtype}: should be one of torch.int8, torch.float8_e4m3fn."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# runner
|
# runner
|
||||||
|
|||||||
@@ -1,177 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import argparse
|
|
||||||
import copy
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from weight_shapes import WEIGHT_SHAPES
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.scalar_type import scalar_types
|
|
||||||
from vllm.triton_utils import triton
|
|
||||||
from vllm.utils.flashinfer import flashinfer_fp4_quantize
|
|
||||||
|
|
||||||
if not current_platform.has_device_capability(100):
|
|
||||||
raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
|
|
||||||
|
|
||||||
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
|
||||||
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
|
|
||||||
|
|
||||||
PROVIDER_CFGS = {
|
|
||||||
"vllm": dict(backend="vllm", enabled=True),
|
|
||||||
"flashinfer": dict(backend="flashinfer", enabled=True),
|
|
||||||
}
|
|
||||||
|
|
||||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
|
||||||
|
|
||||||
|
|
||||||
def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
|
|
||||||
"""Compute global scale for FP4 quantization."""
|
|
||||||
amax = torch.abs(tensor).max().to(torch.float32)
|
|
||||||
return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax
|
|
||||||
|
|
||||||
|
|
||||||
@triton.testing.perf_report(
|
|
||||||
triton.testing.Benchmark(
|
|
||||||
x_names=["batch_size"],
|
|
||||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
|
|
||||||
x_log=False,
|
|
||||||
line_arg="provider",
|
|
||||||
line_vals=_enabled,
|
|
||||||
line_names=_enabled,
|
|
||||||
ylabel="us (lower is better)",
|
|
||||||
plot_name="NVFP4 Input Quantization Latency (us)",
|
|
||||||
args={},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
def benchmark(batch_size, provider, N, K):
|
|
||||||
M = batch_size
|
|
||||||
device = "cuda"
|
|
||||||
dtype = torch.bfloat16
|
|
||||||
|
|
||||||
# Create input tensor
|
|
||||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
|
||||||
|
|
||||||
# Compute global scale for activation
|
|
||||||
a_global_scale = compute_global_scale(a)
|
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
|
||||||
|
|
||||||
cfg = PROVIDER_CFGS[provider]
|
|
||||||
|
|
||||||
if cfg["backend"] == "vllm":
|
|
||||||
# vLLM's FP4 quantization
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
|
||||||
lambda: ops.scaled_fp4_quant(a, a_global_scale),
|
|
||||||
quantiles=quantiles,
|
|
||||||
)
|
|
||||||
elif cfg["backend"] == "flashinfer":
|
|
||||||
# FlashInfer's FP4 quantization
|
|
||||||
# Use is_sf_swizzled_layout=True to match vLLM's output format
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
|
||||||
lambda: flashinfer_fp4_quantize(
|
|
||||||
a, a_global_scale, is_sf_swizzled_layout=True
|
|
||||||
),
|
|
||||||
quantiles=quantiles,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert ms to us for better readability at small batch sizes
|
|
||||||
to_us = lambda t_ms: t_ms * 1000
|
|
||||||
return to_us(ms), to_us(max_ms), to_us(min_ms)
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_shapes(args):
|
|
||||||
out = []
|
|
||||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
|
||||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
|
||||||
KN[tp_dim] //= tp_size
|
|
||||||
KN.append(model)
|
|
||||||
out.append(KN)
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
|
||||||
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
|
|
||||||
# Create input tensor
|
|
||||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
|
||||||
|
|
||||||
# Compute global scale
|
|
||||||
a_global_scale = compute_global_scale(a)
|
|
||||||
|
|
||||||
# vLLM quantization
|
|
||||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
|
|
||||||
|
|
||||||
# FlashInfer quantization (with swizzled layout to match vLLM's output)
|
|
||||||
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
|
|
||||||
a, a_global_scale, is_sf_swizzled_layout=True
|
|
||||||
)
|
|
||||||
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
|
|
||||||
|
|
||||||
# Compare outputs
|
|
||||||
torch.testing.assert_close(
|
|
||||||
vllm_fp4,
|
|
||||||
flashinfer_fp4,
|
|
||||||
)
|
|
||||||
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
|
|
||||||
|
|
||||||
|
|
||||||
def test_accuracy():
|
|
||||||
"""Run accuracy tests across various shapes."""
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print("Running accuracy tests: vLLM vs FlashInfer")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
device = "cuda"
|
|
||||||
dtype = torch.bfloat16
|
|
||||||
|
|
||||||
# Test various batch sizes and hidden dimensions
|
|
||||||
Ms = [1, 1024]
|
|
||||||
Ks = [4096]
|
|
||||||
|
|
||||||
for M in Ms:
|
|
||||||
for K in Ks:
|
|
||||||
_test_accuracy_once(M, K, dtype, device)
|
|
||||||
|
|
||||||
print("\nAll accuracy tests passed!")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--models",
|
|
||||||
nargs="+",
|
|
||||||
type=str,
|
|
||||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
|
||||||
choices=list(WEIGHT_SHAPES.keys()),
|
|
||||||
)
|
|
||||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
|
||||||
parser.add_argument(
|
|
||||||
"--save-path",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to save benchmark results",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--accuracy",
|
|
||||||
action="store_true",
|
|
||||||
help="Run accuracy tests",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if args.accuracy:
|
|
||||||
test_accuracy()
|
|
||||||
|
|
||||||
for K, N, model in prepare_shapes(args):
|
|
||||||
print(f"\n{model}, N={N} K={K}")
|
|
||||||
benchmark.run(
|
|
||||||
print_data=True,
|
|
||||||
save_path=args.save_path,
|
|
||||||
N=N,
|
|
||||||
K=K,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("\nBenchmark finished!")
|
|
||||||
@@ -8,12 +8,13 @@ import torch
|
|||||||
|
|
||||||
import vllm.model_executor.layers.activation # noqa F401
|
import vllm.model_executor.layers.activation # noqa F401
|
||||||
from vllm.model_executor.custom_op import CustomOp
|
from vllm.model_executor.custom_op import CustomOp
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
batch_size_range = [1, 16, 128]
|
batch_size_range = [1, 16, 32, 64, 128]
|
||||||
seq_len_range = [1, 16, 64, 1024, 4096]
|
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||||
intermediate_size = [3072, 9728, 12288]
|
intermediate_size = [3072, 9728, 12288]
|
||||||
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
|
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
|
||||||
|
|
||||||
@@ -29,7 +30,7 @@ def benchmark_activation(
|
|||||||
device = "cuda"
|
device = "cuda"
|
||||||
num_tokens = batch_size * seq_len
|
num_tokens = batch_size * seq_len
|
||||||
dim = intermediate_size
|
dim = intermediate_size
|
||||||
set_random_seed(42)
|
current_platform.seed_everything(42)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
|
|
||||||
if func_name == "gelu_and_mul":
|
if func_name == "gelu_and_mul":
|
||||||
|
|||||||
@@ -11,23 +11,16 @@ import nvtx
|
|||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as benchmark
|
import torch.utils.benchmark as benchmark
|
||||||
|
|
||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||||
from vllm.model_executor.layers.fused_moe.config import (
|
from vllm.model_executor.layers.fused_moe.config import (
|
||||||
fp8_w8a8_moe_quant_config,
|
fp8_w8a8_moe_quant_config,
|
||||||
nvfp4_moe_quant_config,
|
nvfp4_moe_quant_config,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
|
||||||
CutlassExpertsFp4,
|
|
||||||
)
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||||
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
|
||||||
MoEPrepareAndFinalizeNoEP,
|
|
||||||
)
|
|
||||||
from vllm.scalar_type import scalar_types
|
from vllm.scalar_type import scalar_types
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.v1.worker.workspace import init_workspace_manager
|
|
||||||
|
|
||||||
WEIGHT_SHAPES_MOE = {
|
WEIGHT_SHAPES_MOE = {
|
||||||
"nvidia/DeepSeek-R1-FP4": [
|
"nvidia/DeepSeek-R1-FP4": [
|
||||||
@@ -194,24 +187,19 @@ def bench_run(
|
|||||||
g1_alphas=w1_gs,
|
g1_alphas=w1_gs,
|
||||||
g2_alphas=w2_gs,
|
g2_alphas=w2_gs,
|
||||||
)
|
)
|
||||||
|
|
||||||
kernel = mk.FusedMoEModularKernel(
|
|
||||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
|
||||||
CutlassExpertsFp4(
|
|
||||||
out_dtype=dtype,
|
|
||||||
max_experts_per_worker=e,
|
|
||||||
quant_config=quant_config,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
with nvtx.annotate("cutlass_moe_fp4", color="green"):
|
with nvtx.annotate("cutlass_moe_fp4", color="green"):
|
||||||
kernel(
|
cutlass_moe_fp4(
|
||||||
hidden_states=a,
|
a=a,
|
||||||
w1=w1_fp4,
|
w1_fp4=w1_fp4,
|
||||||
w2=w2_fp4,
|
w2_fp4=w2_fp4,
|
||||||
topk_weights=topk_weights,
|
topk_weights=topk_weights,
|
||||||
topk_ids=topk_ids,
|
topk_ids=topk_ids,
|
||||||
|
m=m,
|
||||||
|
n=n,
|
||||||
|
k=k,
|
||||||
|
e=num_experts,
|
||||||
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_cutlass_from_graph(
|
def run_cutlass_from_graph(
|
||||||
@@ -241,24 +229,20 @@ def bench_run(
|
|||||||
g2_alphas=w2_gs,
|
g2_alphas=w2_gs,
|
||||||
)
|
)
|
||||||
|
|
||||||
kernel = mk.FusedMoEModularKernel(
|
|
||||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
|
||||||
CutlassExpertsFp4(
|
|
||||||
out_dtype=dtype,
|
|
||||||
max_experts_per_worker=e,
|
|
||||||
quant_config=quant_config,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
with set_current_vllm_config(
|
with set_current_vllm_config(
|
||||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||||
):
|
):
|
||||||
return kernel(
|
return cutlass_moe_fp4(
|
||||||
hidden_states=a,
|
a=a,
|
||||||
w1=w1_fp4,
|
w1_fp4=w1_fp4,
|
||||||
w2=w2_fp4,
|
w2_fp4=w2_fp4,
|
||||||
topk_weights=topk_weights,
|
topk_weights=topk_weights,
|
||||||
topk_ids=topk_ids,
|
topk_ids=topk_ids,
|
||||||
|
m=m,
|
||||||
|
n=n,
|
||||||
|
k=k,
|
||||||
|
e=num_experts,
|
||||||
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_triton_from_graph(
|
def run_triton_from_graph(
|
||||||
@@ -457,10 +441,6 @@ def bench_run(
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
# Initialize workspace manager (required for CUTLASS MoE kernels)
|
|
||||||
device = torch.device("cuda:0")
|
|
||||||
init_workspace_manager(device)
|
|
||||||
|
|
||||||
print("Benchmarking models:")
|
print("Benchmarking models:")
|
||||||
for i, model in enumerate(args.models):
|
for i, model in enumerate(args.models):
|
||||||
print(f"[{i}] {model}")
|
print(f"[{i}] {model}")
|
||||||
@@ -6,19 +6,15 @@ kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
|
|||||||
but use different quantization strategies and backends.
|
but use different quantization strategies and backends.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import nvtx
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||||
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
|
||||||
MoEPrepareAndFinalizeNoEP,
|
|
||||||
)
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.v1.worker.workspace import init_workspace_manager
|
|
||||||
|
|
||||||
# Weight shapes for different models: [num_experts, topk, hidden_size,
|
# Weight shapes for different models: [num_experts, topk, hidden_size,
|
||||||
# intermediate_size]
|
# intermediate_size]
|
||||||
@@ -62,7 +58,6 @@ def bench_run(
|
|||||||
per_out_ch: bool,
|
per_out_ch: bool,
|
||||||
mkn: tuple[int, int, int],
|
mkn: tuple[int, int, int],
|
||||||
):
|
):
|
||||||
init_workspace_manager(torch.cuda.current_device())
|
|
||||||
(m, k, n) = mkn
|
(m, k, n) = mkn
|
||||||
|
|
||||||
dtype = torch.half
|
dtype = torch.half
|
||||||
@@ -125,6 +120,85 @@ def bench_run(
|
|||||||
# Force per-tensor quantization for all cases
|
# Force per-tensor quantization for all cases
|
||||||
per_act_token = False
|
per_act_token = False
|
||||||
|
|
||||||
|
# Create stride tensors for CUTLASS
|
||||||
|
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
||||||
|
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
|
||||||
|
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
|
||||||
|
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
||||||
|
|
||||||
|
def run_triton_moe(
|
||||||
|
a: torch.Tensor,
|
||||||
|
w1: torch.Tensor,
|
||||||
|
w2: torch.Tensor,
|
||||||
|
topk_weights: torch.Tensor,
|
||||||
|
topk_ids: torch.Tensor,
|
||||||
|
w1_scale: torch.Tensor,
|
||||||
|
w2_scale: torch.Tensor,
|
||||||
|
a1_scale: torch.Tensor,
|
||||||
|
a2_scale: torch.Tensor,
|
||||||
|
num_repeats: int,
|
||||||
|
):
|
||||||
|
quant_config = fp8_w8a8_moe_quant_config(
|
||||||
|
w1_scale=w1_scale,
|
||||||
|
w2_scale=w2_scale,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
per_act_token_quant=per_act_token,
|
||||||
|
per_out_ch_quant=per_out_ch,
|
||||||
|
)
|
||||||
|
|
||||||
|
for _ in range(num_repeats):
|
||||||
|
fused_experts(
|
||||||
|
a,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
topk_weights,
|
||||||
|
topk_ids,
|
||||||
|
quant_config=quant_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
def run_cutlass_moe_fp8(
|
||||||
|
a: torch.Tensor,
|
||||||
|
w1: torch.Tensor,
|
||||||
|
w2: torch.Tensor,
|
||||||
|
topk_weights: torch.Tensor,
|
||||||
|
topk_ids: torch.Tensor,
|
||||||
|
ab_strides1: torch.Tensor,
|
||||||
|
ab_strides2: torch.Tensor,
|
||||||
|
c_strides1: torch.Tensor,
|
||||||
|
c_strides2: torch.Tensor,
|
||||||
|
w1_scale: torch.Tensor,
|
||||||
|
w2_scale: torch.Tensor,
|
||||||
|
a1_scale: torch.Tensor,
|
||||||
|
a2_scale: torch.Tensor,
|
||||||
|
num_repeats: int,
|
||||||
|
):
|
||||||
|
quant_config = fp8_w8a8_moe_quant_config(
|
||||||
|
w1_scale=w1_scale,
|
||||||
|
w2_scale=w2_scale,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
per_act_token_quant=per_act_token,
|
||||||
|
per_out_ch_quant=per_out_ch,
|
||||||
|
)
|
||||||
|
|
||||||
|
for _ in range(num_repeats):
|
||||||
|
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
|
||||||
|
cutlass_moe_fp8(
|
||||||
|
a=a,
|
||||||
|
w1_q=w1,
|
||||||
|
w2_q=w2,
|
||||||
|
topk_weights=topk_weights,
|
||||||
|
topk_ids=topk_ids,
|
||||||
|
ab_strides1=ab_strides1,
|
||||||
|
ab_strides2=ab_strides2,
|
||||||
|
c_strides1=c_strides1,
|
||||||
|
c_strides2=c_strides2,
|
||||||
|
quant_config=quant_config,
|
||||||
|
activation="silu",
|
||||||
|
global_num_experts=num_experts,
|
||||||
|
)
|
||||||
|
|
||||||
# Pre-create quantization config to avoid creating it inside CUDA graph
|
# Pre-create quantization config to avoid creating it inside CUDA graph
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
quant_config = fp8_w8a8_moe_quant_config(
|
||||||
w1_scale=w1_scale,
|
w1_scale=w1_scale,
|
||||||
@@ -135,30 +209,23 @@ def bench_run(
|
|||||||
per_out_ch_quant=per_out_ch,
|
per_out_ch_quant=per_out_ch,
|
||||||
)
|
)
|
||||||
|
|
||||||
fn = mk.FusedMoEModularKernel(
|
|
||||||
MoEPrepareAndFinalizeNoEP(),
|
|
||||||
CutlassExpertsFp8(
|
|
||||||
out_dtype=a.dtype,
|
|
||||||
e=num_experts,
|
|
||||||
n=n,
|
|
||||||
k=k,
|
|
||||||
quant_config=quant_config,
|
|
||||||
device=w1.device,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
|
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
|
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
# Capture 10 invocations like benchmark_moe.py
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
fn(
|
cutlass_moe_fp8(
|
||||||
a,
|
a=a,
|
||||||
w1_fp8q_cutlass,
|
w1_q=w1_fp8q_cutlass,
|
||||||
w2_fp8q_cutlass,
|
w2_q=w2_fp8q_cutlass,
|
||||||
topk_weights,
|
topk_weights=topk_weights,
|
||||||
topk_ids,
|
topk_ids=topk_ids,
|
||||||
|
ab_strides1=ab_strides1,
|
||||||
|
ab_strides2=ab_strides2,
|
||||||
|
c_strides1=c_strides1,
|
||||||
|
c_strides2=c_strides2,
|
||||||
|
quant_config=quant_config,
|
||||||
activation="silu",
|
activation="silu",
|
||||||
global_num_experts=num_experts,
|
global_num_experts=num_experts,
|
||||||
)
|
)
|
||||||
@@ -230,10 +297,6 @@ def bench_run(
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
# Initialize workspace manager (required for CUTLASS MoE kernels)
|
|
||||||
device = torch.device("cuda:0")
|
|
||||||
init_workspace_manager(device)
|
|
||||||
|
|
||||||
print("Benchmarking models:")
|
print("Benchmarking models:")
|
||||||
for i, model in enumerate(args.models):
|
for i, model in enumerate(args.models):
|
||||||
print(f"[{i}] {model}")
|
print(f"[{i}] {model}")
|
||||||
|
|||||||
@@ -293,7 +293,7 @@ class CommunicatorBenchmark:
|
|||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
graph_pool = torch.cuda.graph_pool_handle()
|
graph_pool = torch.cuda.graph_pool_handle()
|
||||||
set_graph_pool_id(graph_pool)
|
set_graph_pool_id(graph_pool)
|
||||||
with torch.cuda.graph(graph, pool=graph_pool, stream=stream):
|
with torch.cuda.graph(graph, pool=graph_pool):
|
||||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||||
allreduce_fn(graph_input)
|
allreduce_fn(graph_input)
|
||||||
|
|
||||||
|
|||||||
@@ -5,20 +5,15 @@ import torch
|
|||||||
import torch.utils.benchmark as benchmark
|
import torch.utils.benchmark as benchmark
|
||||||
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||||
|
|
||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||||
fused_experts,
|
fused_experts,
|
||||||
fused_topk,
|
fused_topk,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
|
||||||
MoEPrepareAndFinalizeNoEP,
|
|
||||||
)
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.v1.worker.workspace import init_workspace_manager
|
|
||||||
|
|
||||||
DEFAULT_MODELS = [
|
DEFAULT_MODELS = [
|
||||||
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
@@ -49,7 +44,6 @@ def bench_run(
|
|||||||
per_out_ch: bool,
|
per_out_ch: bool,
|
||||||
mkn: tuple[int, int, int],
|
mkn: tuple[int, int, int],
|
||||||
):
|
):
|
||||||
init_workspace_manager(torch.cuda.current_device())
|
|
||||||
label = "Quant Matmul"
|
label = "Quant Matmul"
|
||||||
|
|
||||||
sub_label = (
|
sub_label = (
|
||||||
@@ -87,6 +81,11 @@ def bench_run(
|
|||||||
a, score, topk, renormalize=False
|
a, score, topk, renormalize=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
||||||
|
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
|
||||||
|
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
|
||||||
|
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
||||||
|
|
||||||
def run_triton_moe(
|
def run_triton_moe(
|
||||||
a: torch.Tensor,
|
a: torch.Tensor,
|
||||||
w1: torch.Tensor,
|
w1: torch.Tensor,
|
||||||
@@ -120,6 +119,10 @@ def bench_run(
|
|||||||
w2: torch.Tensor,
|
w2: torch.Tensor,
|
||||||
w1_scale: torch.Tensor,
|
w1_scale: torch.Tensor,
|
||||||
w2_scale: torch.Tensor,
|
w2_scale: torch.Tensor,
|
||||||
|
ab_strides1: torch.Tensor,
|
||||||
|
ab_strides2: torch.Tensor,
|
||||||
|
c_strides1: torch.Tensor,
|
||||||
|
c_strides2: torch.Tensor,
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
per_act_token: bool,
|
per_act_token: bool,
|
||||||
@@ -131,29 +134,31 @@ def bench_run(
|
|||||||
per_act_token_quant=per_act_token,
|
per_act_token_quant=per_act_token,
|
||||||
)
|
)
|
||||||
|
|
||||||
fn = mk.FusedMoEModularKernel(
|
|
||||||
MoEPrepareAndFinalizeNoEP(),
|
|
||||||
CutlassExpertsFp8(
|
|
||||||
out_dtype=a.dtype,
|
|
||||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
|
||||||
e=w2.shape[0],
|
|
||||||
n=w2.shape[2],
|
|
||||||
k=w2.shape[1],
|
|
||||||
quant_config=quant_config,
|
|
||||||
device=w1.device,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
fn(a, w1, w2, topk_weights, topk_ids)
|
cutlass_moe_fp8(
|
||||||
|
a,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
topk_weights,
|
||||||
|
topk_ids,
|
||||||
|
ab_strides1,
|
||||||
|
ab_strides2,
|
||||||
|
c_strides1,
|
||||||
|
c_strides2,
|
||||||
|
quant_config=quant_config,
|
||||||
|
)
|
||||||
|
|
||||||
def run_cutlass_from_graph(
|
def run_cutlass_from_graph(
|
||||||
a: torch.Tensor,
|
a: torch.Tensor,
|
||||||
a_scale: torch.Tensor,
|
a_scale: torch.Tensor,
|
||||||
w1: torch.Tensor,
|
w1_q: torch.Tensor,
|
||||||
w2: torch.Tensor,
|
w2_q: torch.Tensor,
|
||||||
w1_scale: torch.Tensor,
|
w1_scale: torch.Tensor,
|
||||||
w2_scale: torch.Tensor,
|
w2_scale: torch.Tensor,
|
||||||
|
ab_strides1: torch.Tensor,
|
||||||
|
ab_strides2: torch.Tensor,
|
||||||
|
c_strides1: torch.Tensor,
|
||||||
|
c_strides2: torch.Tensor,
|
||||||
topk_weights: torch.Tensor,
|
topk_weights: torch.Tensor,
|
||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
):
|
):
|
||||||
@@ -163,23 +168,21 @@ def bench_run(
|
|||||||
per_act_token_quant=per_act_token,
|
per_act_token_quant=per_act_token,
|
||||||
)
|
)
|
||||||
|
|
||||||
fn = mk.FusedMoEModularKernel(
|
|
||||||
MoEPrepareAndFinalizeNoEP(),
|
|
||||||
CutlassExpertsFp8(
|
|
||||||
out_dtype=a.dtype,
|
|
||||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
|
||||||
e=w2.shape[0],
|
|
||||||
n=w2.shape[2],
|
|
||||||
k=w2.shape[1],
|
|
||||||
quant_config=quant_config,
|
|
||||||
device=w1.device,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
with set_current_vllm_config(
|
with set_current_vllm_config(
|
||||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||||
):
|
):
|
||||||
return fn(a, w1, w2, topk_weights, topk_ids)
|
return cutlass_moe_fp8(
|
||||||
|
a,
|
||||||
|
w1_q,
|
||||||
|
w2_q,
|
||||||
|
topk_weights,
|
||||||
|
topk_ids,
|
||||||
|
ab_strides1,
|
||||||
|
ab_strides2,
|
||||||
|
c_strides1,
|
||||||
|
c_strides2,
|
||||||
|
quant_config=quant_config,
|
||||||
|
)
|
||||||
|
|
||||||
def run_triton_from_graph(
|
def run_triton_from_graph(
|
||||||
a: torch.Tensor,
|
a: torch.Tensor,
|
||||||
@@ -223,6 +226,10 @@ def bench_run(
|
|||||||
w2_q,
|
w2_q,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
w2_scale,
|
w2_scale,
|
||||||
|
ab_strides1,
|
||||||
|
ab_strides2,
|
||||||
|
c_strides1,
|
||||||
|
c_strides2,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
)
|
)
|
||||||
@@ -260,6 +267,10 @@ def bench_run(
|
|||||||
"w1_scale": w1_scale,
|
"w1_scale": w1_scale,
|
||||||
"w2_scale": w2_scale,
|
"w2_scale": w2_scale,
|
||||||
"per_act_token": per_act_token,
|
"per_act_token": per_act_token,
|
||||||
|
"ab_strides1": ab_strides1,
|
||||||
|
"ab_strides2": ab_strides2,
|
||||||
|
"c_strides1": c_strides1,
|
||||||
|
"c_strides2": c_strides2,
|
||||||
# cuda graph params
|
# cuda graph params
|
||||||
"cutlass_graph": cutlass_graph,
|
"cutlass_graph": cutlass_graph,
|
||||||
"triton_graph": triton_graph,
|
"triton_graph": triton_graph,
|
||||||
@@ -318,6 +329,10 @@ def bench_run(
|
|||||||
w2_q,
|
w2_q,
|
||||||
w1_scale,
|
w1_scale,
|
||||||
w2_scale,
|
w2_scale,
|
||||||
|
ab_strides1,
|
||||||
|
ab_strides2,
|
||||||
|
c_strides1,
|
||||||
|
c_strides2,
|
||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
per_act_token,
|
per_act_token,
|
||||||
@@ -326,7 +341,7 @@ def bench_run(
|
|||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
benchmark.Timer(
|
benchmark.Timer(
|
||||||
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
|
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
|
||||||
globals=globals,
|
globals=globals,
|
||||||
label=label,
|
label=label,
|
||||||
sub_label=sub_label,
|
sub_label=sub_label,
|
||||||
@@ -349,10 +364,6 @@ def bench_run(
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
# Initialize workspace manager (required for CUTLASS MoE kernels)
|
|
||||||
device = torch.device("cuda:0")
|
|
||||||
init_workspace_manager(device)
|
|
||||||
|
|
||||||
print("Benchmarking models:")
|
print("Benchmarking models:")
|
||||||
for i, model in enumerate(args.models):
|
for i, model in enumerate(args.models):
|
||||||
print(f"[{i}] {model}")
|
print(f"[{i}] {model}")
|
||||||
|
|||||||
@@ -6,8 +6,9 @@ import time
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
@@ -21,7 +22,7 @@ def main(
|
|||||||
num_warmup_iters: int = 5,
|
num_warmup_iters: int = 5,
|
||||||
num_iters: int = 100,
|
num_iters: int = 100,
|
||||||
) -> None:
|
) -> None:
|
||||||
set_random_seed(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import gc
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@@ -24,48 +23,9 @@ from vllm.platforms import current_platform
|
|||||||
from vllm.transformers_utils.config import get_config
|
from vllm.transformers_utils.config import get_config
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import set_random_seed
|
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
|
|
||||||
# Default interval for clearing Triton JIT cache during tuning
|
|
||||||
# Set to 0 to disable automatic cache clearing
|
|
||||||
_CACHE_CLEAR_INTERVAL_ENV = "VLLM_MOE_TUNE_CACHE_CLEAR_INTERVAL"
|
|
||||||
TRITON_CACHE_CLEAR_INTERVAL = int(os.environ.get(_CACHE_CLEAR_INTERVAL_ENV, "50"))
|
|
||||||
|
|
||||||
|
|
||||||
def clear_triton_cache():
|
|
||||||
"""Clear Triton JIT compilation cache and Python/CUDA memory.
|
|
||||||
|
|
||||||
This helps prevent OOM during tuning with large models (many experts).
|
|
||||||
"""
|
|
||||||
# Force Python garbage collection
|
|
||||||
gc.collect()
|
|
||||||
|
|
||||||
# Clear CUDA memory cache
|
|
||||||
if torch.cuda.is_available():
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
# Try to clear Triton's runtime cache
|
|
||||||
try:
|
|
||||||
if (
|
|
||||||
hasattr(triton, "runtime")
|
|
||||||
and hasattr(triton.runtime, "cache")
|
|
||||||
and hasattr(triton.runtime.cache, "clear")
|
|
||||||
):
|
|
||||||
triton.runtime.cache.clear()
|
|
||||||
except ImportError:
|
|
||||||
# Triton not installed, skip cache clearing
|
|
||||||
pass
|
|
||||||
except AttributeError:
|
|
||||||
# Triton version doesn't have expected cache API
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Warning: Failed to clear Triton cache: {e}")
|
|
||||||
|
|
||||||
# Additional garbage collection after clearing caches
|
|
||||||
gc.collect()
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_divisibility(numerator, denominator, text):
|
def ensure_divisibility(numerator, denominator, text):
|
||||||
"""Ensure that numerator is divisible by the denominator."""
|
"""Ensure that numerator is divisible by the denominator."""
|
||||||
@@ -430,7 +390,7 @@ def merge_unique_dicts(list1, list2):
|
|||||||
class BenchmarkWorker:
|
class BenchmarkWorker:
|
||||||
def __init__(self, seed: int) -> None:
|
def __init__(self, seed: int) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
set_random_seed(seed)
|
current_platform.seed_everything(seed)
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
# Get the device ID to allocate tensors and kernels
|
# Get the device ID to allocate tensors and kernels
|
||||||
# on the respective GPU. This is required for Ray to work
|
# on the respective GPU. This is required for Ray to work
|
||||||
@@ -450,7 +410,7 @@ class BenchmarkWorker:
|
|||||||
block_quant_shape: list[int] = None,
|
block_quant_shape: list[int] = None,
|
||||||
use_deep_gemm: bool = False,
|
use_deep_gemm: bool = False,
|
||||||
) -> tuple[dict[str, int], float]:
|
) -> tuple[dict[str, int], float]:
|
||||||
set_random_seed(self.seed)
|
current_platform.seed_everything(self.seed)
|
||||||
dtype_str = _get_config_dtype_str(
|
dtype_str = _get_config_dtype_str(
|
||||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||||
)
|
)
|
||||||
@@ -523,7 +483,7 @@ class BenchmarkWorker:
|
|||||||
need_device_guard = True
|
need_device_guard = True
|
||||||
|
|
||||||
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
|
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
|
||||||
for idx, config in enumerate(tqdm(search_space)):
|
for config in tqdm(search_space):
|
||||||
try:
|
try:
|
||||||
kernel_time = benchmark_config(
|
kernel_time = benchmark_config(
|
||||||
config,
|
config,
|
||||||
@@ -546,19 +506,6 @@ class BenchmarkWorker:
|
|||||||
if kernel_time < best_time:
|
if kernel_time < best_time:
|
||||||
best_time = kernel_time
|
best_time = kernel_time
|
||||||
best_config = config
|
best_config = config
|
||||||
|
|
||||||
# Periodically clear Triton JIT cache to prevent OOM
|
|
||||||
# This is especially important for large models with many experts
|
|
||||||
if (
|
|
||||||
TRITON_CACHE_CLEAR_INTERVAL > 0
|
|
||||||
and idx > 0
|
|
||||||
and idx % TRITON_CACHE_CLEAR_INTERVAL == 0
|
|
||||||
):
|
|
||||||
clear_triton_cache()
|
|
||||||
|
|
||||||
# Final cleanup after tuning completes
|
|
||||||
clear_triton_cache()
|
|
||||||
|
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
||||||
assert best_config is not None
|
assert best_config is not None
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
|||||||
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import set_random_seed
|
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
|
|
||||||
@@ -262,7 +261,7 @@ def benchmark_unpermute(
|
|||||||
class BenchmarkWorker:
|
class BenchmarkWorker:
|
||||||
def __init__(self, seed: int) -> None:
|
def __init__(self, seed: int) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
set_random_seed(seed)
|
current_platform.seed_everything(seed)
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
# Get the device ID to allocate tensors and kernels
|
# Get the device ID to allocate tensors and kernels
|
||||||
# on the respective GPU. This is required for Ray to work
|
# on the respective GPU. This is required for Ray to work
|
||||||
@@ -280,7 +279,7 @@ class BenchmarkWorker:
|
|||||||
use_int8_w8a16: bool,
|
use_int8_w8a16: bool,
|
||||||
use_customized_permute: bool = False,
|
use_customized_permute: bool = False,
|
||||||
) -> tuple[dict[str, int], float]:
|
) -> tuple[dict[str, int], float]:
|
||||||
set_random_seed(self.seed)
|
current_platform.seed_everything(self.seed)
|
||||||
|
|
||||||
permute_time = benchmark_permute(
|
permute_time = benchmark_permute(
|
||||||
num_tokens,
|
num_tokens,
|
||||||
|
|||||||
@@ -37,9 +37,9 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.transformers_utils.config import get_config
|
from vllm.transformers_utils.config import get_config
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import set_random_seed
|
|
||||||
|
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ def benchmark_mrope(
|
|||||||
benchmark_iter: int = 100,
|
benchmark_iter: int = 100,
|
||||||
csv_writer=None,
|
csv_writer=None,
|
||||||
):
|
):
|
||||||
set_random_seed(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
# the parameters to compute the q k v size based on tp_size
|
# the parameters to compute the q k v size based on tp_size
|
||||||
mrope_helper_class = get_rope(
|
mrope_helper_class = get_rope(
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|||||||
from vllm.utils.torch_utils import (
|
from vllm.utils.torch_utils import (
|
||||||
STR_DTYPE_TO_TORCH_DTYPE,
|
STR_DTYPE_TO_TORCH_DTYPE,
|
||||||
create_kv_caches_with_random,
|
create_kv_caches_with_random,
|
||||||
set_random_seed,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@@ -39,7 +38,7 @@ def main(
|
|||||||
device: str = "cuda",
|
device: str = "cuda",
|
||||||
kv_cache_dtype: str | None = None,
|
kv_cache_dtype: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
set_random_seed(seed)
|
current_platform.seed_everything(seed)
|
||||||
|
|
||||||
scale = float(1.0 / (head_size**0.5))
|
scale = float(1.0 / (head_size**0.5))
|
||||||
query = torch.empty(
|
query = torch.empty(
|
||||||
|
|||||||
@@ -6,8 +6,9 @@ import time
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
@@ -22,7 +23,7 @@ def main(
|
|||||||
num_warmup_iters: int = 5,
|
num_warmup_iters: int = 5,
|
||||||
num_iters: int = 100,
|
num_iters: int = 100,
|
||||||
) -> None:
|
) -> None:
|
||||||
set_random_seed(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||||
|
|||||||
@@ -8,11 +8,11 @@ from tabulate import tabulate
|
|||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import (
|
from vllm.utils.torch_utils import (
|
||||||
STR_DTYPE_TO_TORCH_DTYPE,
|
STR_DTYPE_TO_TORCH_DTYPE,
|
||||||
create_kv_caches_with_random,
|
create_kv_caches_with_random,
|
||||||
set_random_seed,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@@ -36,7 +36,7 @@ def run_benchmark(
|
|||||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||||
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
||||||
|
|
||||||
set_random_seed(42)
|
current_platform.seed_everything(42)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
|
|
||||||
# create random key / value tensors [T, H, D].
|
# create random key / value tensors [T, H, D].
|
||||||
|
|||||||
@@ -7,15 +7,15 @@ import torch
|
|||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.attention.ops.triton_reshape_and_cache_flash import (
|
||||||
|
triton_reshape_and_cache_flash,
|
||||||
|
)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import (
|
from vllm.utils.torch_utils import (
|
||||||
STR_DTYPE_TO_TORCH_DTYPE,
|
STR_DTYPE_TO_TORCH_DTYPE,
|
||||||
create_kv_caches_with_random_flash,
|
create_kv_caches_with_random_flash,
|
||||||
set_random_seed,
|
|
||||||
)
|
|
||||||
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
|
|
||||||
triton_reshape_and_cache_flash,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@@ -49,7 +49,7 @@ def run_benchmark(
|
|||||||
if implementation == "triton" and kv_cache_layout == "HND":
|
if implementation == "triton" and kv_cache_layout == "HND":
|
||||||
return float("nan") # Triton does not support HND layout yet.
|
return float("nan") # Triton does not support HND layout yet.
|
||||||
|
|
||||||
set_random_seed(42)
|
current_platform.seed_everything(42)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
|
|
||||||
# create random key / value tensors [T, H, D].
|
# create random key / value tensors [T, H, D].
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ import torch
|
|||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||||
persistent_masked_m_silu_mul_quant,
|
persistent_masked_m_silu_mul_quant,
|
||||||
)
|
)
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import tl, triton
|
||||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
||||||
from vllm.utils.torch_utils import set_random_seed
|
|
||||||
|
|
||||||
|
|
||||||
@triton.jit
|
@triton.jit
|
||||||
@@ -207,7 +207,7 @@ def benchmark(
|
|||||||
):
|
):
|
||||||
def generate_data(seed_offset=0):
|
def generate_data(seed_offset=0):
|
||||||
"""Generate input data with given seed offset"""
|
"""Generate input data with given seed offset"""
|
||||||
set_random_seed(42 + seed_offset)
|
current_platform.seed_everything(42 + seed_offset)
|
||||||
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
|
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
|
||||||
|
|
||||||
if gen_strategy == "random_imbalanced":
|
if gen_strategy == "random_imbalanced":
|
||||||
|
|||||||
@@ -1,272 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import functools
|
|
||||||
import time
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm._custom_ops import (
|
|
||||||
cpu_attention_with_kv_cache,
|
|
||||||
cpu_attn_get_scheduler_metadata,
|
|
||||||
cpu_attn_reshape_and_cache,
|
|
||||||
)
|
|
||||||
from vllm.platforms import CpuArchEnum, current_platform
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|
||||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
|
|
||||||
|
|
||||||
|
|
||||||
def get_attn_isa(
    block_size: int | None = None,
    dtype: torch.dtype | None = None,
):
    """Choose the ISA name used by the CPU attention kernels.

    When both ``block_size`` and ``dtype`` are truthy the choice is delegated
    to ``_get_attn_isa``. Otherwise the host is probed: "neon" on ARM, "amx"
    when torch reports AMX tile support, and "vec" in every other case.
    """
    fully_specified = block_size and dtype
    if fully_specified:
        return _get_attn_isa(dtype, block_size)

    # No kernel configuration given: fall back to a plain hardware probe.
    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
        return "neon"
    return "amx" if torch._C._cpu._is_amx_tile_supported() else "vec"
|
|
||||||
|
|
||||||
|
|
||||||
# Drawing random numbers is slow, so tensors are memoized per (elem_num, dtype).
@functools.lru_cache(maxsize=128, typed=False)
def tensor_cache(
    elem_num: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Return a cached 1-D tensor holding ``elem_num`` standard-normal samples.

    Calling again with the same arguments hands back the same tensor object.
    """
    return torch.randn(elem_num, dtype=dtype)
|
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
def main(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: int | None = None,
    dtype: torch.dtype = torch.bfloat16,
    block_size: int = 128,
    num_blocks: int = 4096,
    use_sink: bool = False,
    enable_kv_split: bool = False,
    isa: str | None = None,
    seed: int = 0,
    iters: int = 20,
) -> None:
    """Time ``cpu_attention_with_kv_cache`` on randomly generated inputs.

    Args:
        seq_lens: One ``(query_len, kv_len)`` pair per sequence in the batch.
        num_heads: ``(num_query_heads, num_kv_heads)``; the first must be a
            multiple of the second.
        head_size: Size of each attention head.
        sliding_window: Sliding-window size, or ``None`` to disable it.
        dtype: dtype of the query and KV-cache tensors.
        block_size: Number of token slots per KV-cache block.
        num_blocks: Number of KV-cache blocks to allocate.
        use_sink: When set, random per-query-head ``s_aux`` values are passed
            to the kernel; otherwise ``s_aux`` is ``None``.
        enable_kv_split: Forwarded to ``cpu_attn_get_scheduler_metadata``.
        isa: ISA name; resolved through ``get_attn_isa`` when ``None``.
        seed: Seed handed to ``current_platform.seed_everything``.
        iters: Number of timed iterations, run after 5 warm-up iterations.

    Prints min / max / mean / std / median latency in milliseconds, or a hint
    to raise ``--iters`` when no timed iteration was requested.
    """
    current_platform.seed_everything(seed)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
    assert num_query_heads % num_kv_heads == 0
    max_kv_len = max(kv_lens)
    # (-1, -1) tells the kernel that no sliding window is applied.
    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
    scale = head_size**-0.5
    token_num = sum(query_lens)

    if isa is None:
        isa = get_attn_isa(block_size, dtype)

    # Sink values are always generated in bfloat16, independent of `dtype`.
    s_aux = (
        15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
    )

    # Random inputs come from the memoized flat tensors and are viewed into shape.
    query = tensor_cache(
        elem_num=token_num * num_query_heads * head_size,
        dtype=dtype,
    )
    query = query.view(
        token_num,
        num_query_heads,
        head_size,
    )

    key_value = tensor_cache(
        elem_num=2 * num_blocks * num_kv_heads * block_size * head_size,
        dtype=dtype,
    )
    key_value = key_value.view(
        2,
        num_blocks,
        block_size,
        num_kv_heads,
        head_size,
    )
    key_cache, value_cache = key_value.unbind(0)

    # KV cache for CPU attention
    packed_key_cache = torch.empty(
        num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
    )
    packed_value_cache = torch.empty_like(packed_key_cache)

    # Cumulative query lengths with a leading 0, i.e. per-sequence start offsets.
    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
        dim=0, dtype=torch.int32
    )
    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
    # Ceiling division: blocks needed to hold the longest KV sequence.
    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(
        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
    )

    # use reshape_and_cache to pack key_cache and value_cache
    slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
    cpu_attn_reshape_and_cache(
        key=key_cache.view(-1, num_kv_heads, head_size),
        value=value_cache.view(-1, num_kv_heads, head_size),
        key_cache=packed_key_cache,
        value_cache=packed_value_cache,
        slot_mapping=slot_mapping,
        isa=isa,
    )

    metadata = cpu_attn_get_scheduler_metadata(
        num_reqs=num_seqs,
        num_heads=num_query_heads,
        num_kv_heads=num_kv_heads,
        head_dim=head_size,
        seq_lens=kv_lens_tensor,
        dtype=dtype,
        query_start_loc=cu_query_lens,
        causal=True,
        sliding_window_size=sliding_window if sliding_window is not None else -1,
        isa=isa,
        enable_kv_split=enable_kv_split,
    )

    out_with_split = torch.empty_like(query)

    def run_benchmark(iters: int) -> list[float]:
        """Run the kernel ``iters`` times and return per-call latency in ms."""
        times = []
        for _ in range(iters):
            start_time = time.perf_counter_ns()
            cpu_attention_with_kv_cache(
                query=query,
                key_cache=packed_key_cache,
                value_cache=packed_value_cache,
                output=out_with_split,
                query_start_loc=cu_query_lens,
                seq_lens=kv_lens_tensor,
                scale=scale,
                causal=True,
                alibi_slopes=None,
                sliding_window=window_size,
                block_table=block_tables,
                softcap=0,
                scheduler_metadata=metadata,
                s_aux=s_aux,
            )
            end_time = time.perf_counter_ns()
            # perf_counter_ns is in nanoseconds; convert to milliseconds.
            times.append((end_time - start_time) / 1e6)
        return times

    # warmup
    run_benchmark(5)
    # benchmark
    times = run_benchmark(iters)

    # min()/max() raise on an empty list, so bail out when nothing was timed.
    if not times:
        print("No iterations to measure. Set --iters > 0.")
        return

    time_min = min(times)
    time_max = max(times)
    time_mean = np.mean(times)
    time_std = np.std(times)

    print("\tmin (ms) = ", time_min)
    print("\tmax (ms) = ", time_max)
    print("\tmean (ms) = ", time_mean)
    print("\tstd = ", time_std)
    print("\tmedian (ms) = ", np.median(times))
|
|
||||||
|
|
||||||
|
|
||||||
def generate_seq_lens(
    batch_size: int,
    q_len_min: int,
    q_len_max: int,
    kv_len_min: int,
    kv_len_max: int,
    seed: int = 0,
) -> list[tuple[int, int]]:
    """Draw ``batch_size`` reproducible ``(query_len, kv_len)`` pairs.

    Lengths are sampled uniformly from the inclusive ranges given, using a
    private CPU generator seeded with ``seed``. Every pair satisfies
    ``query_len <= kv_len``.
    """
    assert 1 <= q_len_min <= q_len_max
    assert 1 <= kv_len_min <= kv_len_max
    assert kv_len_max >= q_len_min

    rng = torch.Generator(device="cpu").manual_seed(seed)

    def draw(low: int, high: int) -> int:
        # torch.randint excludes its upper bound, hence the +1.
        sample = torch.randint(low, high + 1, (1,), generator=rng)
        return sample.item()

    # The KV length may never drop below the shortest admissible query.
    kv_floor = max(kv_len_min, q_len_min)

    pairs: list[tuple[int, int]] = []
    for _ in range(batch_size):
        # KV length is drawn first so the query length can be capped by it.
        kv_len = draw(kv_floor, kv_len_max)
        q_len = draw(q_len_min, min(q_len_max, kv_len))
        pairs.append((q_len, kv_len))

    return pairs
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: parse options, draw the batch shape, run the benchmark.
    parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.")

    # Batch shape and cache size.
    parser.add_argument("--batch-size", default=64, type=int)
    parser.add_argument("--q-len-min", default=512, type=int)
    parser.add_argument("--q-len-max", default=512, type=int)
    parser.add_argument("--kv-len-min", default=512, type=int)
    parser.add_argument("--kv-len-max", default=512, type=int)
    parser.add_argument("--num-blocks", default=4096, type=int)

    # Attention configuration.
    parser.add_argument("--sliding-window", default=None, type=int)
    parser.add_argument("--num-query-heads", default=32, type=int)
    parser.add_argument("--num-kv-heads", default=8, type=int)
    parser.add_argument(
        "--head-size",
        default=128,
        type=int,
        choices=CPUAttentionBackend.get_supported_head_sizes(),
    )
    parser.add_argument("--enable-kv-split", action="store_true")
    parser.add_argument("--block-size", default=128, type=int, choices=[32, 64, 128])
    parser.add_argument(
        "--dtype", default="bfloat16", type=str, choices=["half", "bfloat16", "float"]
    )
    parser.add_argument("--use-sink", action="store_true")
    parser.add_argument(
        "--isa", default=None, type=str, choices=["vec", "neon", "amx", "vec16"]
    )

    # Run control.
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--iters", default=20, type=int)

    args = parser.parse_args()
    print(args)

    seq_lens = generate_seq_lens(
        batch_size=args.batch_size,
        q_len_min=args.q_len_min,
        q_len_max=args.q_len_max,
        kv_len_min=args.kv_len_min,
        kv_len_max=args.kv_len_max,
        seed=args.seed,
    )

    print("batch (query len, kv len) = ", seq_lens)

    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
    # An explicit --isa wins; otherwise derive it from block size and dtype.
    if args.isa is not None:
        selected_isa = args.isa
    else:
        selected_isa = get_attn_isa(args.block_size, torch_dtype)

    main(
        seq_lens=seq_lens,
        num_heads=(args.num_query_heads, args.num_kv_heads),
        head_size=args.head_size,
        sliding_window=args.sliding_window,
        dtype=torch_dtype,
        block_size=args.block_size,
        num_blocks=args.num_blocks,
        use_sink=args.use_sink,
        enable_kv_split=args.enable_kv_split,
        isa=selected_isa,
        seed=args.seed,
        iters=args.iters,
    )
|
|
||||||
@@ -1,175 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|
||||||
|
|
||||||
# Check if CPU MoE operations are available
|
|
||||||
try:
|
|
||||||
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
|
|
||||||
except (ImportError, AttributeError) as e:
|
|
||||||
print("ERROR: CPU fused MoE operations are not available on this platform.")
|
|
||||||
print("This benchmark requires x86 CPU with proper vLLM CPU extensions compiled.")
|
|
||||||
print(
|
|
||||||
"The cpu_fused_moe kernel is typically available on Linux x86_64 "
|
|
||||||
"with AVX2/AVX512."
|
|
||||||
)
|
|
||||||
print(f"Import error: {e}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# ISA selection following test_cpu_fused_moe.py pattern
|
|
||||||
ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
def main(
    batch_size: int,
    expert_num: int,
    hidden_size: int,
    intermediate_size: int,
    topk_num: int,
    use_bias: bool = False,
    dtype: torch.dtype = torch.bfloat16,
    activation: str = "silu",
    isa: str = "vec",
    seed: int = 0,
    iters: int = 20,
) -> None:
    """Time ``cpu_fused_moe`` on random inputs and print latency statistics.

    Random activations, expert weights, optional biases and router logits are
    generated after seeding with ``seed``. The weights are prepacked for
    ``isa``, the kernel is warmed up for 5 calls and then timed for ``iters``
    calls. Prints min / max / mean / std / median latency in milliseconds and
    an estimated throughput in TFLOP/s.
    """
    current_platform.seed_everything(seed)

    # The gate and up projections are stacked, so the first matmul is 2x wide.
    up_dim = 2 * intermediate_size

    def scaled_randn(shape: tuple, fan: int) -> torch.Tensor:
        # Standard-normal samples divided by 0.5 * sqrt(fan).
        return torch.randn(shape, dtype=dtype) / (0.5 * fan**0.5)

    input_tensor = scaled_randn((batch_size, hidden_size), hidden_size)
    w13 = scaled_randn((expert_num, up_dim, hidden_size), hidden_size)
    w2 = scaled_randn((expert_num, hidden_size, intermediate_size), intermediate_size)

    if use_bias:
        w13_bias = scaled_randn((expert_num, up_dim), up_dim)
        w2_bias = scaled_randn((expert_num, hidden_size), hidden_size)
    else:
        w13_bias = None
        w2_bias = None

    # Routing: softmax over random logits in float32, then keep the top-k experts.
    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(score, topk_num)
    topk_ids = topk_ids.to(torch.int32)

    packed_w13 = cpu_prepack_moe_weight(w13, isa)
    packed_w2 = cpu_prepack_moe_weight(w2, isa)

    def run_benchmark(iters: int) -> list[float]:
        """Call the kernel ``iters`` times; return each latency in ms."""
        latencies_ms = []
        for _ in range(iters):
            t_begin = time.perf_counter_ns()
            _ = cpu_fused_moe(
                input_tensor,
                packed_w13,
                packed_w2,
                w13_bias,
                w2_bias,
                topk_weights,
                topk_ids,
                activation,
                isa,
            )
            t_end = time.perf_counter_ns()
            # Nanoseconds to milliseconds.
            latencies_ms.append((t_end - t_begin) / 1e6)
        return latencies_ms

    # Warm-up runs are discarded; only the second batch is reported.
    run_benchmark(5)
    times = run_benchmark(iters)

    if not times:
        print("No iterations to measure. Set --iters > 0.")
        return

    time_min = min(times)
    time_max = max(times)
    time_mean = np.mean(times)
    time_std = np.std(times)

    print("\tmin (ms) = ", time_min)
    print("\tmax (ms) = ", time_max)
    print("\tmean (ms) = ", time_mean)
    print("\tstd = ", time_std)
    print("\tmedian (ms) = ", np.median(times))

    # Throughput estimate:
    # FLOPs = 2 * batch * topk * (hidden * up_dim + intermediate * hidden)
    flops_per_token = (
        2 * topk_num * (hidden_size * up_dim + intermediate_size * hidden_size)
    )
    total_flops = batch_size * flops_per_token
    # time_mean is in milliseconds, hence the 1e-3 factor.
    tflops = total_flops / (time_mean * 1e-3) / 1e12
    print(f"\tthroughput (TFLOP/s) = {tflops:.4f}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: parse options, resolve the top-k default, run main().
    parser = FlexibleArgumentParser(description="Benchmark the CPU fused MoE kernel.")

    # Problem size.
    parser.add_argument("--batch-size", default=64, type=int)
    parser.add_argument("--expert-num", default=8, type=int)
    parser.add_argument("--hidden-size", default=2880, type=int)
    parser.add_argument("--intermediate-size", default=2880, type=int)
    parser.add_argument(
        "--topk-num",
        default=None,
        type=int,
        help="Number of experts to route each token to (default: expert_num // 2)",
    )

    # Kernel variant.
    parser.add_argument("--use-bias", action="store_true")
    parser.add_argument(
        "--activation",
        default="silu",
        type=str,
        choices=["silu", "swigluoai"],
        help="Activation function",
    )
    parser.add_argument(
        "--isa",
        default=ISA_CHOICES[0],
        type=str,
        choices=ISA_CHOICES,
        help=f"ISA to use (available: {ISA_CHOICES})",
    )

    # Run control.
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--iters", default=20, type=int)

    args = parser.parse_args()

    # Without --topk-num, route to half of the experts, but never fewer than one.
    if args.topk_num is not None:
        topk_num = args.topk_num
    else:
        topk_num = max(args.expert_num // 2, 1)

    print(args)

    main(
        batch_size=args.batch_size,
        expert_num=args.expert_num,
        hidden_size=args.hidden_size,
        intermediate_size=args.intermediate_size,
        topk_num=topk_num,
        use_bias=args.use_bias,
        dtype=torch.bfloat16,  # Following test_cpu_fused_moe.py
        activation=args.activation,
        isa=args.isa,
        seed=args.seed,
        iters=args.iters,
    )
|
|
||||||
@@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
PUBLIC ${oneDNN_BINARY_DIR}/include
|
PUBLIC ${oneDNN_BINARY_DIR}/include
|
||||||
PRIVATE ${oneDNN_SOURCE_DIR}/src
|
PRIVATE ${oneDNN_SOURCE_DIR}/src
|
||||||
)
|
)
|
||||||
target_link_libraries(dnnl_ext dnnl torch)
|
target_link_libraries(dnnl_ext dnnl)
|
||||||
target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
|
target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
|
||||||
list(APPEND LIBS dnnl_ext)
|
list(APPEND LIBS dnnl_ext)
|
||||||
set(USE_ONEDNN ON)
|
set(USE_ONEDNN ON)
|
||||||
@@ -358,13 +358,13 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
|
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
|
||||||
"csrc/cpu/cpu_attn.cpp"
|
"csrc/cpu/cpu_attn.cpp"
|
||||||
|
"csrc/cpu/scratchpad_manager.cpp"
|
||||||
"csrc/cpu/torch_bindings.cpp")
|
"csrc/cpu/torch_bindings.cpp")
|
||||||
|
|
||||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
"csrc/cpu/shm.cpp"
|
"csrc/cpu/shm.cpp"
|
||||||
"csrc/cpu/cpu_wna16.cpp"
|
"csrc/cpu/cpu_wna16.cpp"
|
||||||
"csrc/cpu/cpu_fused_moe.cpp"
|
|
||||||
${VLLM_EXT_SRC})
|
${VLLM_EXT_SRC})
|
||||||
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
|
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
|
|||||||
@@ -35,21 +35,16 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
|||||||
# sm90a
|
# sm90a
|
||||||
|
|
||||||
set(SUPPORT_ARCHS)
|
set(SUPPORT_ARCHS)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
|
||||||
list(APPEND SUPPORT_ARCHS "9.0a")
|
list(APPEND SUPPORT_ARCHS 9.0a)
|
||||||
endif()
|
endif()
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
|
||||||
# CUDA 12.9 has introduced "Family-Specific Architecture Features"
|
list(APPEND SUPPORT_ARCHS 10.0a)
|
||||||
# this supports all compute_10x family
|
|
||||||
list(APPEND SUPPORT_ARCHS "10.0f")
|
|
||||||
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
|
||||||
list(APPEND SUPPORT_ARCHS "10.0a")
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
||||||
if(FLASH_MLA_ARCHS)
|
if(FLASH_MLA_ARCHS)
|
||||||
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
|
|
||||||
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
||||||
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
||||||
|
|
||||||
@@ -131,8 +126,7 @@ if(FLASH_MLA_ARCHS)
|
|||||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||||
else()
|
else()
|
||||||
message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
|
# Create empty targets for setup.py when not targeting sm90a systems
|
||||||
# Create empty targets for setup.py on unsupported systems
|
|
||||||
add_custom_target(_flashmla_C)
|
add_custom_target(_flashmla_C)
|
||||||
add_custom_target(_flashmla_extension_C)
|
add_custom_target(_flashmla_extension_C)
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -31,15 +31,10 @@ if(NOT qutlass_SOURCE_DIR)
|
|||||||
endif()
|
endif()
|
||||||
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
|
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}")
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS)
|
if(QUTLASS_ARCHS MATCHES "10\\.0a")
|
||||||
|
|
||||||
if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)")
|
|
||||||
set(QUTLASS_TARGET_CC 100)
|
set(QUTLASS_TARGET_CC 100)
|
||||||
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
|
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
|
||||||
set(QUTLASS_TARGET_CC 120)
|
set(QUTLASS_TARGET_CC 120)
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
|
GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
# Don't share the vllm-flash-attn build between build types
|
# Don't share the vllm-flash-attn build between build types
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
|||||||
@@ -15,61 +15,19 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
|
|||||||
const scalar_t& y) {
|
const scalar_t& y) {
|
||||||
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
|
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns true when `ptr` is 16-byte aligned, i.e. the low four bits of its
// address are zero. Callers use this before int4 (128-bit) vectorized access.
|
|
||||||
__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
|
|
||||||
return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Activation and gating kernel template.
|
// Activation and gating kernel template.
|
||||||
|
|
||||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
|
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
|
||||||
bool act_first>
|
bool act_first>
|
||||||
__global__ void act_and_mul_kernel(
|
__global__ void act_and_mul_kernel(
|
||||||
scalar_t* __restrict__ out, // [..., d]
|
scalar_t* __restrict__ out, // [..., d]
|
||||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||||
const int d) {
|
const int d) {
|
||||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const scalar_t* x_ptr = input + token_idx * 2 * d;
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
const scalar_t* y_ptr = x_ptr + d;
|
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||||
scalar_t* out_ptr = out + token_idx * d;
|
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
|
||||||
|
out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
|
||||||
// Check alignment for 128-bit vectorized access.
|
|
||||||
// All three pointers must be 16-byte aligned for safe int4 operations.
|
|
||||||
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
|
|
||||||
is_16byte_aligned(out_ptr);
|
|
||||||
|
|
||||||
if (aligned && d >= VEC_SIZE) {
|
|
||||||
// Fast path: 128-bit vectorized loop
|
|
||||||
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
|
|
||||||
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
|
|
||||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
|
||||||
const int num_vecs = d / VEC_SIZE;
|
|
||||||
const int vec_end = num_vecs * VEC_SIZE;
|
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
|
||||||
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
|
|
||||||
auto* xp = reinterpret_cast<scalar_t*>(&x);
|
|
||||||
auto* yp = reinterpret_cast<scalar_t*>(&y);
|
|
||||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < VEC_SIZE; j++) {
|
|
||||||
rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
|
|
||||||
}
|
|
||||||
out_vec[i] = r;
|
|
||||||
}
|
|
||||||
// Scalar cleanup for remaining elements
|
|
||||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
|
||||||
out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
|
|
||||||
VLLM_LDG(&y_ptr[i]));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Scalar fallback for unaligned data or small d
|
|
||||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
|
||||||
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
|
|
||||||
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
|
|
||||||
out_ptr[idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -162,115 +120,50 @@ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
|
|||||||
__global__ void act_and_mul_kernel_with_param(
|
__global__ void act_and_mul_kernel_with_param(
|
||||||
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
|
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
|
||||||
const float param) {
|
const float param) {
|
||||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const scalar_t* x_ptr = input + token_idx * 2 * d;
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
const scalar_t* y_ptr = x_ptr + d;
|
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||||
scalar_t* out_ptr = out + token_idx * d;
|
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
|
||||||
|
out[token_idx * d + idx] = ACT_FN(x, param) * y;
|
||||||
// Check alignment for 128-bit vectorized access
|
|
||||||
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
|
|
||||||
is_16byte_aligned(out_ptr);
|
|
||||||
|
|
||||||
if (aligned && d >= VEC_SIZE) {
|
|
||||||
// Fast path: 128-bit vectorized loop
|
|
||||||
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
|
|
||||||
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
|
|
||||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
|
||||||
const int num_vecs = d / VEC_SIZE;
|
|
||||||
const int vec_end = num_vecs * VEC_SIZE;
|
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
|
||||||
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
|
|
||||||
auto* xp = reinterpret_cast<scalar_t*>(&x);
|
|
||||||
auto* yp = reinterpret_cast<scalar_t*>(&y);
|
|
||||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < VEC_SIZE; j++) {
|
|
||||||
rp[j] = ACT_FN(xp[j], param) * yp[j];
|
|
||||||
}
|
|
||||||
out_vec[i] = r;
|
|
||||||
}
|
|
||||||
// Scalar cleanup for remaining elements
|
|
||||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
|
||||||
out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Scalar fallback for unaligned data or small d
|
|
||||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
|
||||||
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
|
|
||||||
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
|
|
||||||
out_ptr[idx] = ACT_FN(x, param) * y;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
|
__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
|
||||||
float alpha, float limit) {
|
float alpha, float limit) {
|
||||||
// Clamp gate to (-inf, limit] and up to [-limit, limit]
|
// clamp gate: min=None, max=limit
|
||||||
const float g = fminf((float)gate, limit);
|
const float gate_f = (float)gate;
|
||||||
const float u = fmaxf(fminf((float)up, limit), -limit);
|
const float clamped_gate = gate_f > limit ? limit : gate_f;
|
||||||
// glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
|
|
||||||
return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
|
// clamp up: min=-limit, max=limit
|
||||||
|
const float up_f = (float)up;
|
||||||
|
const float clamped_up =
|
||||||
|
up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
|
||||||
|
|
||||||
|
// glu = gate * sigmoid(gate * alpha)
|
||||||
|
const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
|
||||||
|
const float glu = clamped_gate * sigmoid_val;
|
||||||
|
|
||||||
|
// (up + 1) * glu
|
||||||
|
return (T)((clamped_up + 1.0f) * glu);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
|
|
||||||
template <typename scalar_t,
|
template <typename scalar_t,
|
||||||
scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
|
scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
|
||||||
const float)>
|
const float)>
|
||||||
__global__ void swigluoai_and_mul_kernel(
|
__global__ void swigluoai_and_mul_kernel(
|
||||||
scalar_t* __restrict__ out, // [..., d]
|
scalar_t* __restrict__ out, // [..., d]
|
||||||
const scalar_t* __restrict__ input, // [..., 2 * d] (interleaved)
|
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||||
const int d, const float alpha, const float limit) {
|
const int d, const float alpha, const float limit) {
|
||||||
// For interleaved data: input has 2*d elements per token (gate/up pairs)
|
|
||||||
// output has d elements per token
|
|
||||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
|
||||||
constexpr int PAIRS = VEC_SIZE / 2; // Number of gate/up pairs per int4 load
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const scalar_t* in_ptr = input + token_idx * 2 * d;
|
// TODO: Vectorize loads and stores.
|
||||||
scalar_t* out_ptr = out + token_idx * d;
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
|
// gate = x[..., ::2] (even indices)
|
||||||
|
const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
|
||||||
|
// up = x[..., 1::2] (odd indices)
|
||||||
|
const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
|
||||||
|
|
||||||
// Check alignment for 128-bit vectorized access on input.
|
out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
|
||||||
// For output we use int2 (64-bit) which has 8-byte alignment requirement.
|
|
||||||
const bool in_aligned = is_16byte_aligned(in_ptr);
|
|
||||||
const bool out_aligned =
|
|
||||||
(reinterpret_cast<uintptr_t>(out_ptr) & 7) == 0; // 8-byte for int2
|
|
||||||
|
|
||||||
if (in_aligned && out_aligned && d >= PAIRS) {
|
|
||||||
// Fast path: vectorized loop
|
|
||||||
// Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
|
|
||||||
// Each int2 store writes PAIRS output elements
|
|
||||||
const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
|
|
||||||
int2* out_vec = reinterpret_cast<int2*>(out_ptr);
|
|
||||||
const int num_vecs = d / PAIRS;
|
|
||||||
const int vec_end = num_vecs * PAIRS;
|
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
|
||||||
int4 v = VLLM_LDG(&in_vec[i]);
|
|
||||||
int2 r;
|
|
||||||
auto* vp = reinterpret_cast<scalar_t*>(&v);
|
|
||||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < PAIRS; j++) {
|
|
||||||
rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
|
|
||||||
}
|
|
||||||
out_vec[i] = r;
|
|
||||||
}
|
|
||||||
// Scalar cleanup for remaining elements
|
|
||||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
|
||||||
out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
|
|
||||||
VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Scalar fallback for unaligned data or small d
|
|
||||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
|
||||||
// gate = x[..., ::2] (even indices)
|
|
||||||
const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
|
|
||||||
// up = x[..., 1::2] (odd indices)
|
|
||||||
const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
|
|
||||||
out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -324,41 +217,10 @@ __global__ void activation_kernel(
|
|||||||
scalar_t* __restrict__ out, // [..., d]
|
scalar_t* __restrict__ out, // [..., d]
|
||||||
const scalar_t* __restrict__ input, // [..., d]
|
const scalar_t* __restrict__ input, // [..., d]
|
||||||
const int d) {
|
const int d) {
|
||||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const scalar_t* in_ptr = input + token_idx * d;
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
scalar_t* out_ptr = out + token_idx * d;
|
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
|
||||||
|
out[token_idx * d + idx] = ACT_FN(x);
|
||||||
// Check alignment for 128-bit vectorized access
|
|
||||||
const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
|
|
||||||
|
|
||||||
if (aligned && d >= VEC_SIZE) {
|
|
||||||
// Fast path: 128-bit vectorized loop
|
|
||||||
const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
|
|
||||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
|
||||||
const int num_vecs = d / VEC_SIZE;
|
|
||||||
const int vec_end = num_vecs * VEC_SIZE;
|
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
|
||||||
int4 v = VLLM_LDG(&in_vec[i]), r;
|
|
||||||
auto* vp = reinterpret_cast<scalar_t*>(&v);
|
|
||||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < VEC_SIZE; j++) {
|
|
||||||
rp[j] = ACT_FN(vp[j]);
|
|
||||||
}
|
|
||||||
out_vec[i] = r;
|
|
||||||
}
|
|
||||||
// Scalar cleanup for remaining elements
|
|
||||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
|
||||||
out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Scalar fallback for unaligned data or small d
|
|
||||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
|
||||||
const scalar_t x = VLLM_LDG(&in_ptr[idx]);
|
|
||||||
out_ptr[idx] = ACT_FN(x);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
17
csrc/cache.h
17
csrc/cache.h
@@ -9,6 +9,16 @@
|
|||||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||||
const torch::Tensor& block_mapping);
|
const torch::Tensor& block_mapping);
|
||||||
|
|
||||||
|
// Note: the key_caches and value_caches vectors are constant but
|
||||||
|
// not the Tensors they contain. The vectors need to be const refs
|
||||||
|
// in order to satisfy pytorch's C++ operator registration code.
|
||||||
|
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||||
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
|
const torch::Tensor& block_mapping);
|
||||||
|
|
||||||
|
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
||||||
|
const torch::Tensor& block_mapping);
|
||||||
|
|
||||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||||
torch::Tensor& slot_mapping,
|
torch::Tensor& slot_mapping,
|
||||||
@@ -27,13 +37,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
|
|||||||
const std::string& kv_cache_dtype,
|
const std::string& kv_cache_dtype,
|
||||||
torch::Tensor& scale);
|
torch::Tensor& scale);
|
||||||
|
|
||||||
// NOTE: k_pe and kv_c order is flipped compared to concat_and_cache_mla
|
|
||||||
void concat_and_cache_mla_rope_fused(
|
|
||||||
torch::Tensor& positions, torch::Tensor& q_pe, torch::Tensor& k_pe,
|
|
||||||
torch::Tensor& kv_c, torch::Tensor& rope_cos_sin_cache, bool rope_is_neox,
|
|
||||||
torch::Tensor& kv_cache_slot_mapping, torch::Tensor& kv_cache,
|
|
||||||
const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale);
|
|
||||||
|
|
||||||
// Just for unittest
|
// Just for unittest
|
||||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||||
const double scale, const std::string& kv_cache_dtype);
|
const double scale, const std::string& kv_cache_dtype);
|
||||||
|
|||||||
@@ -119,6 +119,94 @@ __global__ void copy_blocks_mla_kernel(
|
|||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
|
// Note: the key_caches and value_caches vectors are constant but
|
||||||
|
// not the Tensors they contain. The vectors need to be const refs
|
||||||
|
// in order to satisfy pytorch's C++ operator registration code.
|
||||||
|
// Launches vllm::copy_blocks_kernel on a (num_layers, num_pairs) grid so the
// block copies listed in block_mapping are applied to every layer's key and
// value cache. Returns without launching when no layers are given.
//
// key_caches / value_caches: one tensor per layer; both vectors must have the
//   same length. The device of key_caches[0] must be CUDA and is used for the
//   launch. The dispatch dtype is taken from key_caches[0].
// block_mapping: 2D tensor with shape (num_pairs, 2), read as int64.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping) {
  // Compare the container sizes directly to avoid a signed/unsigned compare.
  TORCH_CHECK(key_caches.size() == value_caches.size(),
              "key_caches and value_caches must have the same length");
  const int num_layers = static_cast<int>(key_caches.size());
  if (num_layers == 0) {
    return;
  }
  torch::Device cache_device = key_caches[0].device();
  TORCH_CHECK(cache_device.is_cuda());

  // Create data structures for the kernel.
  // Host-side tables of raw cache addresses, one entry per layer. These are
  // std::vector (as in copy_blocks_mla) rather than variable-length arrays,
  // which are a compiler extension and not standard C++.
  std::vector<int64_t> key_cache_ptrs(num_layers);
  std::vector<int64_t> value_cache_ptrs(num_layers);
  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
    key_cache_ptrs[layer_idx] =
        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
    value_cache_ptrs[layer_idx] =
        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
  }

  // block_mapping is a 2D tensor with shape (num_pairs, 2).
  int num_pairs = block_mapping.size(0);

  // Move the data structures to the GPU.
  // NOTE: This synchronizes the CPU and GPU.
  torch::Tensor key_cache_ptrs_tensor =
      torch::from_blob(key_cache_ptrs.data(), {num_layers}, torch::kInt64)
          .to(cache_device);
  torch::Tensor value_cache_ptrs_tensor =
      torch::from_blob(value_cache_ptrs.data(), {num_layers}, torch::kInt64)
          .to(cache_device);

  // Launch the kernel: one thread block per (layer, pair), with at most 1024
  // threads per block.
  const int numel_per_block = key_caches[0][0].numel();
  dim3 grid(num_layers, num_pairs);
  dim3 block(std::min(1024, numel_per_block));
  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
            key_cache_ptrs_tensor.data_ptr<int64_t>(),
            value_cache_ptrs_tensor.data_ptr<int64_t>(),
            block_mapping.data_ptr<int64_t>(), numel_per_block);
      }));
}
|
||||||
|
|
||||||
|
// copy blocks kernel for MLA (assumes a joint KV-cache)
|
||||||
|
// Launches vllm::copy_blocks_mla_kernel on a (num_layers, num_pairs) grid for
// the per-layer caches in kv_caches. Returns without launching when
// kv_caches is empty.
//
// kv_caches: one cache tensor per layer; the device of kv_caches[0] must be
//   CUDA and is used for the launch.
// block_mapping: tensor whose first dimension is the number of pairs; read as
//   int64.
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
                     const torch::Tensor& block_mapping) {
  const int num_layers = kv_caches.size();
  if (num_layers == 0) {
    return;
  }
  const torch::Device cache_device = kv_caches[0].device();
  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");

  // Collect the raw address of every layer's cache on the host, then copy
  // the table to the cache device for the kernel to read.
  std::vector<int64_t> layer_addrs;
  layer_addrs.reserve(num_layers);
  for (const torch::Tensor& layer_cache : kv_caches) {
    layer_addrs.push_back(reinterpret_cast<int64_t>(layer_cache.data_ptr()));
  }
  torch::Tensor layer_addrs_dev =
      torch::from_blob(layer_addrs.data(), {num_layers}, torch::kInt64)
          .to(cache_device);

  // We use the stride instead of numel in case the cache is padded for memory
  // alignment reasons, we assume the blocks data (inclusive of any padding)
  // is contiguous in memory
  const int block_span = kv_caches[0].stride(0);
  const int pair_count = block_mapping.size(0);

  const dim3 grid(num_layers, pair_count);
  const dim3 block(std::min(1024, block_span));
  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
            layer_addrs_dev.data_ptr<int64_t>(),
            block_mapping.data_ptr<int64_t>(), block_span);
      }));
}
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Used to copy/convert one element
|
// Used to copy/convert one element
|
||||||
@@ -451,6 +539,9 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
for (int i = 0; i < VEC_SIZE; i++) {
|
for (int i = 0; i < VEC_SIZE; i++) {
|
||||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
||||||
}
|
}
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
__syncwarp();
|
||||||
|
#endif
|
||||||
|
|
||||||
// Reduced amax
|
// Reduced amax
|
||||||
for (int mask = 16; mask > 0; mask /= 2) {
|
for (int mask = 16; mask > 0; mask /= 2) {
|
||||||
@@ -460,7 +551,9 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
__syncwarp();
|
||||||
|
#endif
|
||||||
#if defined(__gfx942__)
|
#if defined(__gfx942__)
|
||||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||||
#else
|
#else
|
||||||
|
|||||||
@@ -1,279 +0,0 @@
|
|||||||
#include <torch/all.h>
|
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
|
||||||
|
|
||||||
#include "cuda_compat.h"
|
|
||||||
#include "dispatch_utils.h"
|
|
||||||
|
|
||||||
#include "quantization/w8a8/fp8/common.cuh"
|
|
||||||
#ifdef USE_ROCM
|
|
||||||
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
|
||||||
#else
|
|
||||||
#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
|
||||||
#include <hip/hip_bf16.h>
|
|
||||||
typedef __hip_bfloat16 __nv_bfloat16;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace vllm {
|
|
||||||
|
|
||||||
// NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's
|
|
||||||
// using u16 as the backing type.
|
|
||||||
template <typename qk_t, bool IS_NEOX, typename raw_kv_scalar_t,
|
|
||||||
typename cache_t, Fp8KVCacheDataType kv_dt>
|
|
||||||
__global__ void concat_and_cache_mla_rope_fused_kernel(
|
|
||||||
const int64_t* __restrict__ positions, // [num_tokens]
|
|
||||||
qk_t* __restrict__ q_pe, // [num_tokens, num_q_heads, rot_dim]
|
|
||||||
qk_t* __restrict__ k_pe, // [num_tokens, rot_dim]
|
|
||||||
const qk_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
|
||||||
const qk_t* __restrict__ rope_cos_sin_cache, // [max_position, 2,
|
|
||||||
// rot_dim // 2]
|
|
||||||
const int rot_dim, const int64_t q_pe_stride_token,
|
|
||||||
const int64_t q_pe_stride_head, const int64_t k_pe_stride,
|
|
||||||
const int64_t kv_c_stride, const int num_q_heads,
|
|
||||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank +
|
|
||||||
// rot_dim)]
|
|
||||||
const int64_t* __restrict__ kv_cache_slot_mapping, // [num_tokens]
|
|
||||||
const int block_stride, const int entry_stride, const int kv_lora_rank,
|
|
||||||
const int block_size, const float* kv_cache_quant_scale) {
|
|
||||||
// Each thread block is responsible for one token.
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
|
||||||
const int64_t pos = positions[token_idx];
|
|
||||||
|
|
||||||
const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
|
|
||||||
|
|
||||||
const int embed_dim = rot_dim / 2;
|
|
||||||
|
|
||||||
// Q ROPE
|
|
||||||
const int nq = num_q_heads * embed_dim;
|
|
||||||
for (int i = threadIdx.x; i < nq; i += blockDim.x) {
|
|
||||||
int head_idx = i / embed_dim;
|
|
||||||
int pair_idx = i % embed_dim;
|
|
||||||
|
|
||||||
// NOTE: Would be nice to have interleaved sin/cos so we could just load
|
|
||||||
// both at the same time.
|
|
||||||
qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
|
|
||||||
qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
|
|
||||||
|
|
||||||
qk_t* q_pe_head_ptr =
|
|
||||||
q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head;
|
|
||||||
|
|
||||||
int pair_idx_x, pair_idx_y;
|
|
||||||
if constexpr (IS_NEOX) {
|
|
||||||
// GPT-NeoX style rotary embedding.
|
|
||||||
pair_idx_x = pair_idx;
|
|
||||||
pair_idx_y = embed_dim + pair_idx;
|
|
||||||
} else {
|
|
||||||
// GPT-J style rotary embedding.
|
|
||||||
pair_idx_x = pair_idx * 2;
|
|
||||||
pair_idx_y = pair_idx * 2 + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
qk_t x_src = q_pe_head_ptr[pair_idx_x];
|
|
||||||
qk_t y_src = q_pe_head_ptr[pair_idx_y];
|
|
||||||
|
|
||||||
qk_t x_dst = x_src * cos - y_src * sin;
|
|
||||||
qk_t y_dst = y_src * cos + x_src * sin;
|
|
||||||
|
|
||||||
q_pe_head_ptr[pair_idx_x] = x_dst;
|
|
||||||
q_pe_head_ptr[pair_idx_y] = y_dst;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t slot_idx = kv_cache_slot_mapping[token_idx];
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
|
||||||
const int64_t entry_idx = slot_idx % block_size;
|
|
||||||
|
|
||||||
// NOTE: slot_idx can be -1 if the token is padded
|
|
||||||
if (slot_idx < 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// K with 1 HEAD
|
|
||||||
for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) {
|
|
||||||
int pair_idx = i;
|
|
||||||
|
|
||||||
qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
|
|
||||||
qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
|
|
||||||
|
|
||||||
qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride;
|
|
||||||
|
|
||||||
int pair_idx_x, pair_idx_y;
|
|
||||||
if constexpr (IS_NEOX) {
|
|
||||||
// GPT-NeoX style rotary embedding.
|
|
||||||
pair_idx_x = pair_idx;
|
|
||||||
pair_idx_y = embed_dim + pair_idx;
|
|
||||||
} else {
|
|
||||||
// GPT-J style rotary embedding.
|
|
||||||
pair_idx_x = pair_idx * 2;
|
|
||||||
pair_idx_y = pair_idx * 2 + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
qk_t x_src = k_pe_head_ptr[pair_idx_x];
|
|
||||||
qk_t y_src = k_pe_head_ptr[pair_idx_y];
|
|
||||||
|
|
||||||
qk_t x_dst = x_src * cos - y_src * sin;
|
|
||||||
qk_t y_dst = y_src * cos + x_src * sin;
|
|
||||||
|
|
||||||
k_pe_head_ptr[pair_idx_x] = x_dst;
|
|
||||||
k_pe_head_ptr[pair_idx_y] = y_dst;
|
|
||||||
|
|
||||||
// NOTE Why is this monster necessary?
|
|
||||||
// When K is of type float16, the actual template replacement for
|
|
||||||
// raw_kv_scalar_t with be u16. That's why it's used at the last moment
|
|
||||||
// otherwise CUDA ALU would break.
|
|
||||||
const raw_kv_scalar_t raw_x_value =
|
|
||||||
*reinterpret_cast<const raw_kv_scalar_t*>(&x_dst);
|
|
||||||
const raw_kv_scalar_t raw_y_value =
|
|
||||||
*reinterpret_cast<const raw_kv_scalar_t*>(&y_dst);
|
|
||||||
|
|
||||||
cache_t* kv_cache_ptr = kv_cache + block_idx * block_stride +
|
|
||||||
entry_idx * entry_stride + kv_lora_rank;
|
|
||||||
|
|
||||||
// MLA Cache Store
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
kv_cache_ptr[pair_idx_x] = raw_x_value;
|
|
||||||
kv_cache_ptr[pair_idx_y] = raw_y_value;
|
|
||||||
} else {
|
|
||||||
kv_cache_ptr[pair_idx_x] =
|
|
||||||
fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
|
|
||||||
raw_x_value, *kv_cache_quant_scale);
|
|
||||||
kv_cache_ptr[pair_idx_y] =
|
|
||||||
fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
|
|
||||||
raw_y_value, *kv_cache_quant_scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOPE
|
|
||||||
for (int i = threadIdx.x; i < kv_lora_rank; i += blockDim.x) {
|
|
||||||
const qk_t* src_ptr = kv_c + token_idx * kv_c_stride + i;
|
|
||||||
const raw_kv_scalar_t src_value =
|
|
||||||
*reinterpret_cast<const raw_kv_scalar_t*>(src_ptr);
|
|
||||||
|
|
||||||
cache_t* kv_cache_ptr =
|
|
||||||
kv_cache + block_idx * block_stride + entry_idx * entry_stride;
|
|
||||||
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
kv_cache_ptr[i] = src_value;
|
|
||||||
} else {
|
|
||||||
kv_cache_ptr[i] = fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
|
|
||||||
src_value, *kv_cache_quant_scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace vllm
|
|
||||||
|
|
||||||
// Launch helper for concat_and_cache_mla_rope_fused. Dispatches on the dtype
// of q_pe, picks the NeoX or non-NeoX instantiation of the fused kernel from
// rope_is_neox, and launches it on <<<grid, block, 0, stream>>>.
// The macro expands inside the host function and relies on that function's
// local names (tensors, strides, grid, block, stream, ...).
#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE)    \
  do {                                                                       \
    VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] { \
      using qk_t = scalar_t;                                                 \
      auto* fused_kernel =                                                   \
          rope_is_neox                                                       \
              ? &vllm::concat_and_cache_mla_rope_fused_kernel<               \
                    qk_t, true, RAW_KV_T, CACHE_T, KV_DTYPE>                 \
              : &vllm::concat_and_cache_mla_rope_fused_kernel<               \
                    qk_t, false, RAW_KV_T, CACHE_T, KV_DTYPE>;               \
      fused_kernel<<<grid, block, 0, stream>>>(                              \
          positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),              \
          k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                      \
          rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim, q_pe_stride_token,   \
          q_pe_stride_head, k_pe_stride, kv_c_stride, num_q_heads,           \
          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),                   \
          kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,           \
          entry_stride, kv_lora_rank, block_size,                            \
          kv_cache_quant_scale.data_ptr<float>());                           \
    });                                                                      \
  } while (false)
|
|
||||||
|
|
||||||
// Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache.
|
|
||||||
// q_pe and k_pe are modified in place.
|
|
||||||
// Replaces DeepseekScalingRotaryEmbedding.self.rotary_emb and
|
|
||||||
// concat_and_cache_mla.
|
|
||||||
// Host entry point: validates the arguments, derives the launch shape and
// dispatches the fused RoPE + MLA cache-write kernel on the current CUDA
// stream of positions' device. q_pe and k_pe are modified in place by the
// kernel.
//
// Launch shape: one thread block per token (num_tokens = q_pe.size(0)), with
// max(min(num_q_heads * rot_dim / 2, 512), kv_lora_rank) threads, capped at
// 512. Nothing is launched when num_tokens is 0.
void concat_and_cache_mla_rope_fused(
    torch::Tensor& positions,           // [num_tokens]
    torch::Tensor& q_pe,                // [num_tokens, num_q_heads, rot_dim]
    torch::Tensor& k_pe,                // [num_tokens, rot_dim]
    torch::Tensor& kv_c,                // [num_tokens, kv_lora_rank]
    torch::Tensor& rope_cos_sin_cache,  // [max_position, rot_dim]
    bool rope_is_neox,
    torch::Tensor&
        kv_cache_slot_mapping,  // [num_tokens] or [num_actual_tokens]
    torch::Tensor&
        kv_cache,  // [num_blocks, block_size, (kv_lora_rank + rot_dim)]
    const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) {
  // Rank checks come first so that every size(i) read below is in range and
  // a wrongly shaped tensor fails on the check that names the problem.
  TORCH_CHECK_EQ(positions.dim(), 1);
  TORCH_CHECK_EQ(q_pe.dim(), 3);
  TORCH_CHECK_EQ(k_pe.dim(), 2);
  TORCH_CHECK_EQ(kv_c.dim(), 2);
  TORCH_CHECK_EQ(kv_cache.dim(), 3);

  // q_pe and kv_c define the problem sizes; the other tensors are checked
  // against them.
  const int64_t num_tokens = q_pe.size(0);
  const int num_q_heads = q_pe.size(1);
  const int rot_dim = q_pe.size(2);
  const int kv_lora_rank = kv_c.size(1);

  TORCH_CHECK(positions.size(0) >=
              num_tokens);  // CUDA Graphs might pad this for us
  TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long);

  TORCH_CHECK_EQ(k_pe.size(0), num_tokens);
  TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
  TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type());

  TORCH_CHECK_EQ(kv_c.size(0), num_tokens);
  TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type());

  TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim);
  TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type());

  TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens);
  TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long);

  TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim);

  TORCH_CHECK_EQ(kv_cache_quant_scale.numel(), 1);
  TORCH_CHECK_EQ(kv_cache_quant_scale.scalar_type(), c10::ScalarType::Float);

  // A grid with zero blocks is not a valid launch configuration, and there
  // is no work to do anyway.
  if (num_tokens == 0) {
    return;
  }

  // The names below are referenced by CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED
  // and must keep these exact spellings.
  int64_t q_pe_stride_token = q_pe.stride(0);
  int64_t q_pe_stride_head = q_pe.stride(1);

  int64_t k_pe_stride = k_pe.stride(0);
  int64_t kv_c_stride = kv_c.stride(0);

  int block_size = kv_cache.size(1);

  int block_stride = kv_cache.stride(0);
  int entry_stride = kv_cache.stride(1);

  // Enough threads for the larger of the two per-token loops, capped at 512.
  int rope_block_size = std::min(num_q_heads * rot_dim / 2, 512);
  int mla_block_size = kv_lora_rank;
  int thread_block_size =
      std::min(std::max(rope_block_size, mla_block_size), 512);

  dim3 grid(num_tokens, 1, 1);
  dim3 block(thread_block_size, 1, 1);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(positions));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
                             CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED);
}
|
|
||||||
@@ -15,7 +15,6 @@
|
|||||||
|
|
||||||
#ifdef __aarch64__
|
#ifdef __aarch64__
|
||||||
#include "cpu_attn_neon.hpp"
|
#include "cpu_attn_neon.hpp"
|
||||||
// NEON requires head_dim to be a multiple of 32
|
|
||||||
#define NEON_DISPATCH(...) \
|
#define NEON_DISPATCH(...) \
|
||||||
case cpu_attention::ISA::NEON: { \
|
case cpu_attention::ISA::NEON: { \
|
||||||
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
|
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
|
||||||
@@ -37,9 +36,7 @@
|
|||||||
switch (HEAD_DIM) { \
|
switch (HEAD_DIM) { \
|
||||||
CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \
|
CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \
|
||||||
CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \
|
CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \
|
||||||
CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__) \
|
|
||||||
CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \
|
CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \
|
||||||
CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__) \
|
|
||||||
CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \
|
CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \
|
||||||
CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \
|
CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \
|
||||||
CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \
|
CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \
|
||||||
|
|||||||
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
|||||||
const int32_t q_heads_per_kv, const int64_t q_num_stride,
|
const int32_t q_heads_per_kv, const int64_t q_num_stride,
|
||||||
const int64_t q_head_stride, const float scale) {
|
const int64_t q_head_stride, const float scale) {
|
||||||
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
|
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
|
||||||
// static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
|
static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
|
||||||
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
|
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
|
||||||
constexpr int64_t head_elem_num_pre_block =
|
constexpr int64_t head_elem_num_pre_block =
|
||||||
AMX_TILE_ROW_BYTES / sizeof(scalar_t);
|
AMX_TILE_ROW_BYTES / sizeof(scalar_t);
|
||||||
|
|||||||
@@ -8,8 +8,10 @@
|
|||||||
#include <sys/sysctl.h>
|
#include <sys/sysctl.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "cpu/cpu_arch_macros.h"
|
#include "cpu_types.hpp"
|
||||||
#include "cpu/utils.hpp"
|
#include "scratchpad_manager.h"
|
||||||
|
#include "cpu_attn_macros.h"
|
||||||
|
#include "utils.hpp"
|
||||||
|
|
||||||
namespace cpu_attention {
|
namespace cpu_attention {
|
||||||
enum class ISA { AMX, VEC, VEC16, NEON };
|
enum class ISA { AMX, VEC, VEC16, NEON };
|
||||||
@@ -376,13 +378,12 @@ class AttentionScheduler {
|
|||||||
|
|
||||||
static constexpr int32_t MaxQTileIterNum = 128;
|
static constexpr int32_t MaxQTileIterNum = 128;
|
||||||
|
|
||||||
AttentionScheduler()
|
AttentionScheduler() : available_cache_size_(get_available_l2_size()) {}
|
||||||
: available_cache_size_(cpu_utils::get_available_l2_size()) {}
|
|
||||||
|
|
||||||
torch::Tensor schedule(const ScheduleInput& input) const {
|
torch::Tensor schedule(const ScheduleInput& input) const {
|
||||||
const bool casual = input.casual;
|
const bool casual = input.casual;
|
||||||
const int32_t thread_num = omp_get_max_threads();
|
const int32_t thread_num = omp_get_max_threads();
|
||||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
const int64_t cache_size = get_available_l2_size();
|
||||||
const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
|
const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
|
||||||
const int32_t kv_len_alignment = input.kv_block_alignment;
|
const int32_t kv_len_alignment = input.kv_block_alignment;
|
||||||
int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
|
int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
|
||||||
@@ -658,7 +659,7 @@ class AttentionScheduler {
|
|||||||
metadata_ptr->thread_num +
|
metadata_ptr->thread_num +
|
||||||
metadata_ptr->reduction_scratchpad_size_per_kv_head *
|
metadata_ptr->reduction_scratchpad_size_per_kv_head *
|
||||||
(use_gqa ? input.num_heads_kv : input.num_heads_q);
|
(use_gqa ? input.num_heads_kv : input.num_heads_q);
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(
|
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
|
||||||
scratchpad_size);
|
scratchpad_size);
|
||||||
|
|
||||||
// metadata_ptr->print();
|
// metadata_ptr->print();
|
||||||
@@ -666,7 +667,7 @@ class AttentionScheduler {
|
|||||||
// test out of boundary access
|
// test out of boundary access
|
||||||
// {
|
// {
|
||||||
// float* cache_ptr =
|
// float* cache_ptr =
|
||||||
// cpu_utils::ScratchPadManager::getl_scratchpad_manager()->get_data<float>();
|
// DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<float>();
|
||||||
// for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
|
// for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
|
||||||
// cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
|
// cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
|
||||||
// }
|
// }
|
||||||
@@ -748,6 +749,27 @@ class AttentionScheduler {
|
|||||||
return std::max(rounded_tile_size, round_size);
|
return std::max(rounded_tile_size, round_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int64_t get_available_l2_size() {
|
||||||
|
static int64_t size = []() {
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
// macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
|
||||||
|
int64_t l2_cache_size = 0;
|
||||||
|
size_t len = sizeof(l2_cache_size);
|
||||||
|
if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
|
||||||
|
l2_cache_size > 0) {
|
||||||
|
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||||
|
}
|
||||||
|
// Fallback if sysctlbyname fails
|
||||||
|
return 128LL * 1024 >> 1; // use 50% of 128KB
|
||||||
|
#else
|
||||||
|
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
TORCH_CHECK_NE(l2_cache_size, -1);
|
||||||
|
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||||
|
#endif
|
||||||
|
}();
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int64_t available_cache_size_;
|
int64_t available_cache_size_;
|
||||||
};
|
};
|
||||||
@@ -1380,7 +1402,7 @@ class AttentionMainLoop {
|
|||||||
|
|
||||||
// init buffers
|
// init buffers
|
||||||
void* scratchpad_ptr =
|
void* scratchpad_ptr =
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()
|
DNNLScratchPadManager::get_dnnl_scratchpad_manager()
|
||||||
->get_data<void>();
|
->get_data<void>();
|
||||||
AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);
|
AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);
|
||||||
|
|
||||||
@@ -1400,7 +1422,8 @@ class AttentionMainLoop {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t available_cache_size = cpu_utils::get_available_l2_size();
|
const int64_t available_cache_size =
|
||||||
|
AttentionScheduler::get_available_l2_size();
|
||||||
const int32_t default_tile_size =
|
const int32_t default_tile_size =
|
||||||
AttentionScheduler::calcu_default_tile_size(
|
AttentionScheduler::calcu_default_tile_size(
|
||||||
available_cache_size, head_dim, sizeof(kv_cache_t),
|
available_cache_size, head_dim, sizeof(kv_cache_t),
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#ifndef CPU_ARCH_MACROS_H
|
#ifndef CPU_ATTN_MACROS_H
|
||||||
#define CPU_ARCH_MACROS_H
|
#define CPU_ATTN_MACROS_H
|
||||||
|
|
||||||
// x86_64
|
// x86_64
|
||||||
#ifdef __x86_64__
|
#ifdef __x86_64__
|
||||||
@@ -26,7 +26,7 @@
|
|||||||
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \
|
_mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \
|
||||||
const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \
|
const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \
|
||||||
const int n_mantissa_bits = 23; \
|
const int n_mantissa_bits = 23; \
|
||||||
auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__(( \
|
auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__(( \
|
||||||
always_inline)) { \
|
always_inline)) { \
|
||||||
__m512 values = vec.reg; \
|
__m512 values = vec.reg; \
|
||||||
auto less_ln_flt_min_mask = \
|
auto less_ln_flt_min_mask = \
|
||||||
@@ -98,7 +98,7 @@
|
|||||||
poly = vbslq_f32(hi_mask, inf, poly); \
|
poly = vbslq_f32(hi_mask, inf, poly); \
|
||||||
return vbslq_f32(lo_mask, zero, poly); \
|
return vbslq_f32(lo_mask, zero, poly); \
|
||||||
}; \
|
}; \
|
||||||
auto fast_exp = [&](const vec_op::FP32Vec16& vec) \
|
auto fast_exp = [&](vec_op::FP32Vec16& vec) \
|
||||||
__attribute__((always_inline)) { \
|
__attribute__((always_inline)) { \
|
||||||
float32x4x4_t result; \
|
float32x4x4_t result; \
|
||||||
result.val[0] = neon_expf(vec.reg.val[0]); \
|
result.val[0] = neon_expf(vec.reg.val[0]); \
|
||||||
@@ -110,4 +110,4 @@
|
|||||||
|
|
||||||
#endif // __aarch64__
|
#endif // __aarch64__
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
|
|||||||
constexpr static ISA ISAType = ISA::NEON;
|
constexpr static ISA ISAType = ISA::NEON;
|
||||||
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
|
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
|
||||||
|
|
||||||
// static_assert(HeadDim % HeadDimAlignment == 0);
|
static_assert(HeadDim % HeadDimAlignment == 0);
|
||||||
// the gemm micro kernel is Mx8
|
// the gemm micro kernel is Mx8
|
||||||
static_assert(HeadDimAlignment % 8 == 0);
|
static_assert(HeadDimAlignment % 8 == 0);
|
||||||
static_assert(BlockSizeAlignment % 8 == 0);
|
static_assert(BlockSizeAlignment % 8 == 0);
|
||||||
|
|||||||
@@ -1,727 +0,0 @@
|
|||||||
#include "cpu/cpu_types.hpp"
|
|
||||||
#include "cpu/utils.hpp"
|
|
||||||
#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
|
|
||||||
#include "cpu/cpu_arch_macros.h"
|
|
||||||
|
|
||||||
// AMX_DISPATCH(fn): the `case cpu_utils::ISA::AMX` arm of an ISA switch.
// When CPU_CAPABILITY_AMXBF16 is defined, the arm aliases gemm_t to the AMX
// micro-GEMM for the in-scope scalar_t and returns fn(). Otherwise it expands
// to a bare case label, so control falls through to the case that follows.
#if defined(CPU_CAPABILITY_AMXBF16)
  #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
  #define AMX_DISPATCH(...)                                                    \
    case cpu_utils::ISA::AMX: {                                                \
      using gemm_t = cpu_micro_gemm::MicroGemm<cpu_utils::ISA::AMX, scalar_t>; \
      return __VA_ARGS__();                                                    \
    }
#else
  #define AMX_DISPATCH(...) case cpu_utils::ISA::AMX:
#endif
|
|
||||||
|
|
||||||
// CPU_ISA_DISPATCH_IMPL(ISA_TYPE, fn): immediately-invoked lambda that
// switches on ISA_TYPE, aliases gemm_t to the matching MicroGemm
// specialization for the in-scope scalar_t, and returns fn(). The AMX arm is
// supplied by AMX_DISPATCH; any ISA other than AMX/VEC fails TORCH_CHECK with
// "Invalid CPU ISA type.".
#define CPU_ISA_DISPATCH_IMPL(ISA_TYPE, ...)                          \
  [&]() {                                                             \
    switch (ISA_TYPE) {                                               \
      AMX_DISPATCH(__VA_ARGS__)                                       \
      case cpu_utils::ISA::VEC: {                                     \
        using gemm_t =                                                \
            cpu_micro_gemm::MicroGemm<cpu_utils::ISA::VEC, scalar_t>; \
        return __VA_ARGS__();                                         \
      }                                                               \
      default: {                                                      \
        TORCH_CHECK(false, "Invalid CPU ISA type.");                  \
      }                                                               \
    }                                                                 \
  }()
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
enum class FusedMOEAct { SiluAndMul, SwigluOAIAndMul };
|
|
||||||
|
|
||||||
FusedMOEAct get_act_type(const std::string& act) {
|
|
||||||
if (act == "silu") {
|
|
||||||
return FusedMOEAct::SiluAndMul;
|
|
||||||
} else if (act == "swigluoai") {
|
|
||||||
return FusedMOEAct::SwigluOAIAndMul;
|
|
||||||
} else {
|
|
||||||
TORCH_CHECK(false, "Invalid act type: " + act);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename scalar_t>
|
|
||||||
void swigluoai_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
|
|
||||||
const int32_t m_size, const int32_t n_size,
|
|
||||||
const int32_t input_stride,
|
|
||||||
const int32_t output_stride) {
|
|
||||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
|
||||||
// For GPT-OSS interleaved gate-up weights
|
|
||||||
alignas(64) static int32_t index[16] = {0, 2, 4, 6, 8, 10, 12, 14,
|
|
||||||
16, 18, 20, 22, 24, 26, 28, 30};
|
|
||||||
vec_op::INT32Vec16 index_vec(index);
|
|
||||||
vec_op::FP32Vec16 gate_up_max_vec(7.0);
|
|
||||||
vec_op::FP32Vec16 up_min_vec(-7.0);
|
|
||||||
vec_op::FP32Vec16 alpha_vec(1.702);
|
|
||||||
vec_op::FP32Vec16 one_vec(1.0);
|
|
||||||
|
|
||||||
DEFINE_FAST_EXP
|
|
||||||
|
|
||||||
for (int32_t m = 0; m < m_size; ++m) {
|
|
||||||
for (int32_t n = 0; n < n_size; n += 32) {
|
|
||||||
vec_op::FP32Vec16 gate_vec(input + n, index_vec);
|
|
||||||
vec_op::FP32Vec16 up_vec(input + n + 1, index_vec);
|
|
||||||
gate_vec = gate_vec.min(gate_up_max_vec);
|
|
||||||
up_vec = up_vec.clamp(up_min_vec, gate_up_max_vec);
|
|
||||||
auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec * alpha_vec));
|
|
||||||
auto glu = gate_vec * sigmoid_vec;
|
|
||||||
auto gated_output_fp32 = (one_vec + up_vec) * glu;
|
|
||||||
scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32);
|
|
||||||
gated_output.save(output + n / 2);
|
|
||||||
}
|
|
||||||
input += input_stride;
|
|
||||||
output += output_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename scalar_t>
|
|
||||||
void silu_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
|
|
||||||
const int32_t m_size, const int32_t n_size,
|
|
||||||
const int32_t input_stride, const int32_t output_stride) {
|
|
||||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
|
||||||
const int32_t dim = n_size / 2;
|
|
||||||
float* __restrict__ gate = input;
|
|
||||||
float* __restrict__ up = input + dim;
|
|
||||||
vec_op::FP32Vec16 one_vec(1.0);
|
|
||||||
|
|
||||||
DEFINE_FAST_EXP
|
|
||||||
|
|
||||||
for (int32_t m = 0; m < m_size; ++m) {
|
|
||||||
for (int32_t n = 0; n < dim; n += 16) {
|
|
||||||
vec_op::FP32Vec16 gate_vec(gate + n);
|
|
||||||
vec_op::FP32Vec16 up_vec(up + n);
|
|
||||||
auto sigmoid_vec = one_vec / (one_vec + fast_exp(-gate_vec));
|
|
||||||
auto silu = gate_vec * sigmoid_vec;
|
|
||||||
auto gated_output_fp32 = up_vec * silu;
|
|
||||||
scalar_vec_t gated_output = scalar_vec_t(gated_output_fp32);
|
|
||||||
gated_output.save(output + n);
|
|
||||||
}
|
|
||||||
gate += input_stride;
|
|
||||||
up += input_stride;
|
|
||||||
output += output_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename scalar_t>
|
|
||||||
FORCE_INLINE void apply_gated_act(const FusedMOEAct act,
|
|
||||||
float* __restrict__ input,
|
|
||||||
scalar_t* __restrict__ output,
|
|
||||||
const int32_t m, const int32_t n,
|
|
||||||
const int32_t input_stride,
|
|
||||||
const int32_t output_stride) {
|
|
||||||
switch (act) {
|
|
||||||
case FusedMOEAct::SwigluOAIAndMul:
|
|
||||||
swigluoai_and_mul(input, output, m, n, input_stride, output_stride);
|
|
||||||
return;
|
|
||||||
case FusedMOEAct::SiluAndMul:
|
|
||||||
silu_and_mul(input, output, m, n, input_stride, output_stride);
|
|
||||||
return;
|
|
||||||
default:
|
|
||||||
TORCH_CHECK(false, "Unsupported act type.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Packs the weights of every expert by calling gemm_t::pack_weight once per
// expert, with experts distributed across OpenMP threads.
//
// Expert e reads from weight_ptr + e * expert_stride and writes to
// packed_weight_ptr + e * expert_stride (offsets in elements); output_size
// and input_size are passed through to pack_weight unchanged.
template <typename scalar_t, typename gemm_t>
void prepack_moe_weight_impl(scalar_t* __restrict__ weight_ptr,
                             scalar_t* __restrict__ packed_weight_ptr,
                             const int32_t expert_num,
                             const int32_t output_size,
                             const int32_t input_size,
                             const int64_t expert_stride) {
#pragma omp parallel for
  for (int32_t expert = 0; expert < expert_num; ++expert) {
    // Same element offset on the source and destination side.
    const int64_t expert_offset = expert_stride * expert;
    scalar_t* src = weight_ptr + expert_offset;
    scalar_t* dst = packed_weight_ptr + expert_offset;
    gemm_t::pack_weight(src, dst, output_size, input_size);
  }
}
|
|
||||||
|
|
||||||
template <typename scalar_t, typename w_t, typename gemm_t>
|
|
||||||
void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input,
|
|
||||||
w_t* __restrict__ w13, w_t* __restrict__ w2,
|
|
||||||
w_t* __restrict__ w13_bias, w_t* __restrict__ w2_bias,
|
|
||||||
float* __restrict__ topk_weights,
|
|
||||||
int32_t* __restrict__ topk_id, FusedMOEAct act_type,
|
|
||||||
const int32_t token_num, const int32_t expert_num,
|
|
||||||
const int32_t topk_num, const int32_t input_size_13,
|
|
||||||
const int32_t output_size_13, const int32_t input_size_2,
|
|
||||||
const int32_t output_size_2) {
|
|
||||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
|
||||||
constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
|
|
||||||
constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
|
|
||||||
constexpr int32_t min_w13_n_tile_size = 2 * gemm_n_tile_size;
|
|
||||||
static_assert(gemm_n_tile_size % 16 == 0);
|
|
||||||
|
|
||||||
TORCH_CHECK_EQ(output_size_13 % min_w13_n_tile_size, 0);
|
|
||||||
TORCH_CHECK_EQ(output_size_2 % gemm_n_tile_size, 0);
|
|
||||||
TORCH_CHECK_EQ(output_size_13 / 2, input_size_2);
|
|
||||||
|
|
||||||
const int32_t thread_num = omp_get_max_threads();
|
|
||||||
|
|
||||||
const int32_t w13_input_buffer_size = cpu_utils::round_up<64>(
|
|
||||||
gemm_m_tile_size * input_size_13 * sizeof(scalar_t));
|
|
||||||
|
|
||||||
const int32_t w13_n_tile_size = [&]() {
|
|
||||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
|
||||||
// input buffer + output buffer + weight
|
|
||||||
const int32_t n_size_cache_limit =
|
|
||||||
(cache_size - w13_input_buffer_size) /
|
|
||||||
(gemm_m_tile_size * sizeof(float) + input_size_13 * sizeof(scalar_t));
|
|
||||||
const int32_t n_size_thread_limit =
|
|
||||||
output_size_13 / std::max(1, thread_num / topk_num);
|
|
||||||
const int32_t n_size = cpu_utils::round_down<min_w13_n_tile_size>(
|
|
||||||
std::min(n_size_cache_limit, n_size_thread_limit));
|
|
||||||
return std::max(n_size, min_w13_n_tile_size);
|
|
||||||
}();
|
|
||||||
|
|
||||||
const int32_t w2_input_tile_size = cpu_utils::round_up<64>(
|
|
||||||
gemm_m_tile_size * input_size_2 * sizeof(scalar_t));
|
|
||||||
|
|
||||||
const int32_t w2_n_tile_size = [&]() {
|
|
||||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
|
||||||
// input tile + weight
|
|
||||||
const int32_t n_size_cache_limit =
|
|
||||||
(cache_size - w2_input_tile_size) / (input_size_2 * sizeof(scalar_t));
|
|
||||||
const int32_t n_size_thread_limit =
|
|
||||||
output_size_2 / std::max(1, thread_num / topk_num);
|
|
||||||
const int32_t n_size = cpu_utils::round_down<gemm_n_tile_size>(
|
|
||||||
std::min(n_size_cache_limit, n_size_thread_limit));
|
|
||||||
return std::max(n_size, gemm_n_tile_size);
|
|
||||||
}();
|
|
||||||
|
|
||||||
// allocate buffers
|
|
||||||
int32_t common_buffer_offset = 0;
|
|
||||||
int32_t w13_thread_buffer_offset = 0;
|
|
||||||
int32_t ws_thread_buffer_offset = 0;
|
|
||||||
|
|
||||||
// common buffers
|
|
||||||
const int32_t token_num_per_group_buffer_size =
|
|
||||||
cpu_utils::round_up<64>(expert_num * sizeof(int32_t));
|
|
||||||
const int32_t token_num_per_group_buffer_offset = common_buffer_offset;
|
|
||||||
common_buffer_offset += token_num_per_group_buffer_size;
|
|
||||||
|
|
||||||
const int32_t cu_token_num_per_group_buffer_size =
|
|
||||||
cpu_utils::round_up<64>((expert_num + 1) * sizeof(int32_t));
|
|
||||||
const int32_t cu_token_num_per_group_buffer_offset = common_buffer_offset;
|
|
||||||
common_buffer_offset += cu_token_num_per_group_buffer_size;
|
|
||||||
|
|
||||||
const int32_t expand_token_id_buffer_size =
|
|
||||||
cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t));
|
|
||||||
const int32_t expand_token_id_buffer_offset = common_buffer_offset;
|
|
||||||
common_buffer_offset += expand_token_id_buffer_size;
|
|
||||||
|
|
||||||
const int32_t expand_token_id_index_buffer_size =
|
|
||||||
cpu_utils::round_up<64>(token_num * topk_num * sizeof(int32_t));
|
|
||||||
const int32_t expand_token_id_index_buffer_offset = common_buffer_offset;
|
|
||||||
common_buffer_offset += expand_token_id_index_buffer_size;
|
|
||||||
|
|
||||||
const int32_t w13_gemm_output_buffer_size = cpu_utils::round_up<64>(
|
|
||||||
token_num * topk_num * (output_size_13 / 2) * sizeof(scalar_t));
|
|
||||||
const int32_t w13_gemm_output_buffer_offset = common_buffer_offset;
|
|
||||||
common_buffer_offset += w13_gemm_output_buffer_size;
|
|
||||||
|
|
||||||
const int32_t w2_gemm_output_buffer_size = cpu_utils::round_up<64>(
|
|
||||||
token_num * topk_num * output_size_2 * sizeof(float));
|
|
||||||
const int32_t w2_gemm_output_buffer_offset = common_buffer_offset;
|
|
||||||
common_buffer_offset += w2_gemm_output_buffer_size;
|
|
||||||
|
|
||||||
// w13 GEMM thread buffers
|
|
||||||
const int32_t w13_input_buffer_offset = w13_thread_buffer_offset;
|
|
||||||
w13_thread_buffer_offset += w13_input_buffer_size;
|
|
||||||
|
|
||||||
const int32_t w13_output_buffer_size = cpu_utils::round_up<64>(
|
|
||||||
gemm_m_tile_size * w13_n_tile_size * sizeof(float));
|
|
||||||
const int32_t w13_output_buffer_offset = w13_thread_buffer_offset;
|
|
||||||
w13_thread_buffer_offset += w13_output_buffer_size;
|
|
||||||
|
|
||||||
// Weighted sum thread buffer
|
|
||||||
const int32_t ws_output_buffer_size =
|
|
||||||
cpu_utils::round_up<64>(output_size_2 * sizeof(float));
|
|
||||||
const int32_t ws_output_buffer_offset = ws_thread_buffer_offset;
|
|
||||||
ws_thread_buffer_offset += ws_output_buffer_size;
|
|
||||||
|
|
||||||
const int32_t buffer_size =
|
|
||||||
common_buffer_offset +
|
|
||||||
std::max(w13_thread_buffer_offset, ws_thread_buffer_offset) * thread_num;
|
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size);
|
|
||||||
uint8_t* common_buffer_start =
|
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()
|
|
||||||
->get_data<uint8_t>();
|
|
||||||
uint8_t* thread_buffer_start = common_buffer_start + common_buffer_offset;
|
|
||||||
|
|
||||||
int32_t* __restrict__ token_num_per_group_buffer = reinterpret_cast<int32_t*>(
|
|
||||||
common_buffer_start + token_num_per_group_buffer_offset);
|
|
||||||
int32_t* __restrict__ cu_token_num_per_group_buffer =
|
|
||||||
reinterpret_cast<int32_t*>(common_buffer_start +
|
|
||||||
cu_token_num_per_group_buffer_offset);
|
|
||||||
int32_t* __restrict__ expand_token_id_buffer = reinterpret_cast<int32_t*>(
|
|
||||||
common_buffer_start + expand_token_id_buffer_offset);
|
|
||||||
int32_t* __restrict__ expand_token_id_index_buffer =
|
|
||||||
reinterpret_cast<int32_t*>(common_buffer_start +
|
|
||||||
expand_token_id_index_buffer_offset);
|
|
||||||
|
|
||||||
// prepare token-expert mappings
|
|
||||||
{
|
|
||||||
std::memset(token_num_per_group_buffer, 0, expert_num * sizeof(int32_t));
|
|
||||||
for (int32_t i = 0; i < token_num * topk_num; ++i) {
|
|
||||||
int32_t curr_expert_id = topk_id[i];
|
|
||||||
++token_num_per_group_buffer[curr_expert_id];
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t token_num_sum = 0;
|
|
||||||
cu_token_num_per_group_buffer[0] = 0;
|
|
||||||
int32_t* token_index_buffer = cu_token_num_per_group_buffer + 1;
|
|
||||||
for (int32_t i = 0; i < expert_num; ++i) {
|
|
||||||
token_index_buffer[i] = token_num_sum;
|
|
||||||
token_num_sum += token_num_per_group_buffer[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int32_t i = 0; i < token_num; ++i) {
|
|
||||||
int32_t* curr_topk_id = topk_id + i * topk_num;
|
|
||||||
int32_t* curr_index_buffer = expand_token_id_index_buffer + i * topk_num;
|
|
||||||
for (int32_t j = 0; j < topk_num; ++j) {
|
|
||||||
int32_t curr_expert_id = curr_topk_id[j];
|
|
||||||
int32_t curr_index = token_index_buffer[curr_expert_id];
|
|
||||||
++token_index_buffer[curr_expert_id];
|
|
||||||
expand_token_id_buffer[curr_index] = i;
|
|
||||||
curr_index_buffer[j] = curr_index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// w13 GEMM + act
|
|
||||||
{
|
|
||||||
alignas(64) cpu_utils::Counter counter;
|
|
||||||
cpu_utils::Counter* counter_ptr = &counter;
|
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static, 1)
|
|
||||||
for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
|
|
||||||
const int32_t task_num_per_expert =
|
|
||||||
(output_size_13 + w13_n_tile_size - 1) / w13_n_tile_size;
|
|
||||||
const int32_t task_num = task_num_per_expert * expert_num;
|
|
||||||
|
|
||||||
uint8_t* __restrict__ thread_buffer =
|
|
||||||
thread_buffer_start + thread_id * w13_thread_buffer_offset;
|
|
||||||
scalar_t* __restrict__ w13_input_buffer =
|
|
||||||
reinterpret_cast<scalar_t*>(thread_buffer + w13_input_buffer_offset);
|
|
||||||
float* __restrict__ w13_output_buffer =
|
|
||||||
reinterpret_cast<float*>(thread_buffer + w13_output_buffer_offset);
|
|
||||||
scalar_t* __restrict__ w13_gemm_output_buffer =
|
|
||||||
reinterpret_cast<scalar_t*>(common_buffer_start +
|
|
||||||
w13_gemm_output_buffer_offset);
|
|
||||||
|
|
||||||
gemm_t gemm;
|
|
||||||
|
|
||||||
const int32_t input_size_13_bytes = input_size_13 * sizeof(scalar_t);
|
|
||||||
const int32_t w13_n_group_stride = 16 * input_size_13;
|
|
||||||
const int32_t w13_n_tile_stride = gemm_n_tile_size * input_size_13;
|
|
||||||
|
|
||||||
for (;;) {
|
|
||||||
int32_t task_id = counter_ptr->acquire_counter();
|
|
||||||
if (task_id >= task_num) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t curr_expert_id = task_id / task_num_per_expert;
|
|
||||||
const int32_t curr_output_group_id = task_id % task_num_per_expert;
|
|
||||||
const int32_t curr_token_num =
|
|
||||||
token_num_per_group_buffer[curr_expert_id];
|
|
||||||
if (curr_token_num == 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t actual_n_tile_size =
|
|
||||||
std::min(w13_n_tile_size,
|
|
||||||
output_size_13 - curr_output_group_id * w13_n_tile_size);
|
|
||||||
const int32_t* __restrict__ curr_expand_token_id_buffer =
|
|
||||||
expand_token_id_buffer +
|
|
||||||
cu_token_num_per_group_buffer[curr_expert_id];
|
|
||||||
scalar_t* __restrict__ curr_w13_gemm_output_buffer =
|
|
||||||
w13_gemm_output_buffer +
|
|
||||||
cu_token_num_per_group_buffer[curr_expert_id] *
|
|
||||||
(output_size_13 / 2) +
|
|
||||||
curr_output_group_id * w13_n_tile_size / 2;
|
|
||||||
|
|
||||||
w_t* __restrict__ w13_weight_ptr_0 = nullptr;
|
|
||||||
w_t* __restrict__ w13_weight_ptr_1 = nullptr;
|
|
||||||
w_t* __restrict__ w13_bias_ptr_0 = nullptr;
|
|
||||||
w_t* __restrict__ w13_bias_ptr_1 = nullptr;
|
|
||||||
if (act_type == FusedMOEAct::SwigluOAIAndMul) {
|
|
||||||
// For SwigluOAIAndMul, up and down weights are interleaved
|
|
||||||
w13_weight_ptr_0 =
|
|
||||||
w13 + curr_expert_id * input_size_13 * output_size_13 +
|
|
||||||
curr_output_group_id * w13_n_tile_size * input_size_13;
|
|
||||||
w13_weight_ptr_1 =
|
|
||||||
w13_weight_ptr_0 + actual_n_tile_size / 2 * input_size_13;
|
|
||||||
if (w13_bias != nullptr) {
|
|
||||||
w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 +
|
|
||||||
curr_output_group_id * w13_n_tile_size;
|
|
||||||
w13_bias_ptr_1 = w13_bias_ptr_0 + actual_n_tile_size / 2;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
w13_weight_ptr_0 =
|
|
||||||
w13 + curr_expert_id * input_size_13 * output_size_13 +
|
|
||||||
curr_output_group_id * (w13_n_tile_size / 2) * input_size_13;
|
|
||||||
w13_weight_ptr_1 =
|
|
||||||
w13_weight_ptr_0 + output_size_13 / 2 * input_size_13;
|
|
||||||
if (w13_bias != nullptr) {
|
|
||||||
w13_bias_ptr_0 = w13_bias + curr_expert_id * output_size_13 +
|
|
||||||
curr_output_group_id * (w13_n_tile_size / 2);
|
|
||||||
w13_bias_ptr_1 = w13_bias_ptr_0 + output_size_13 / 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
scalar_t* __restrict__ curr_w13_input_buffer = w13_input_buffer;
|
|
||||||
for (int32_t token_idx = 0; token_idx < curr_token_num;
|
|
||||||
token_idx += gemm_m_tile_size) {
|
|
||||||
const int32_t actual_token_num =
|
|
||||||
std::min(gemm_m_tile_size, curr_token_num - token_idx);
|
|
||||||
// copy inputs
|
|
||||||
{
|
|
||||||
scalar_t* __restrict__ curr_w13_input_buffer_iter =
|
|
||||||
curr_w13_input_buffer;
|
|
||||||
for (int32_t i = 0; i < actual_token_num; ++i) {
|
|
||||||
const int32_t curr_token_id = curr_expand_token_id_buffer[i];
|
|
||||||
int8_t* __restrict__ curr_input_iter = reinterpret_cast<int8_t*>(
|
|
||||||
input + curr_token_id * input_size_13);
|
|
||||||
int8_t* __restrict__ curr_output_iter =
|
|
||||||
reinterpret_cast<int8_t*>(curr_w13_input_buffer_iter);
|
|
||||||
int32_t j = 0;
|
|
||||||
for (; j < input_size_13_bytes - 64; j += 64) {
|
|
||||||
vec_op::INT8Vec64 vec(curr_input_iter);
|
|
||||||
vec.save(curr_output_iter);
|
|
||||||
curr_input_iter += 64;
|
|
||||||
curr_output_iter += 64;
|
|
||||||
}
|
|
||||||
vec_op::INT8Vec64 vec(curr_input_iter);
|
|
||||||
vec.save(curr_output_iter, input_size_13_bytes - j);
|
|
||||||
|
|
||||||
// update
|
|
||||||
curr_w13_input_buffer_iter += input_size_13;
|
|
||||||
}
|
|
||||||
// update
|
|
||||||
curr_expand_token_id_buffer += actual_token_num;
|
|
||||||
}
|
|
||||||
|
|
||||||
// gemm + act
|
|
||||||
{
|
|
||||||
scalar_t* __restrict__ w13_weight_ptr_0_iter = w13_weight_ptr_0;
|
|
||||||
scalar_t* __restrict__ w13_weight_ptr_1_iter = w13_weight_ptr_1;
|
|
||||||
scalar_t* __restrict__ w13_bias_ptr_0_iter = w13_bias_ptr_0;
|
|
||||||
scalar_t* __restrict__ w13_bias_ptr_1_iter = w13_bias_ptr_1;
|
|
||||||
scalar_t* __restrict__ curr_w13_input_buffer_iter =
|
|
||||||
curr_w13_input_buffer;
|
|
||||||
float* __restrict__ w13_output_buffer_0_iter = w13_output_buffer;
|
|
||||||
float* __restrict__ w13_output_buffer_1_iter =
|
|
||||||
w13_output_buffer + actual_n_tile_size / 2;
|
|
||||||
for (int32_t i = 0; i < actual_n_tile_size;
|
|
||||||
i += min_w13_n_tile_size) {
|
|
||||||
gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_0_iter,
|
|
||||||
w13_output_buffer_0_iter, actual_token_num,
|
|
||||||
input_size_13, input_size_13, w13_n_group_stride,
|
|
||||||
actual_n_tile_size, false);
|
|
||||||
|
|
||||||
if (w13_bias != nullptr) {
|
|
||||||
cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
|
|
||||||
w13_output_buffer_0_iter, w13_output_buffer_0_iter,
|
|
||||||
w13_bias_ptr_0_iter, actual_token_num, actual_n_tile_size,
|
|
||||||
actual_n_tile_size);
|
|
||||||
w13_bias_ptr_0_iter += gemm_n_tile_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_1_iter,
|
|
||||||
w13_output_buffer_1_iter, actual_token_num,
|
|
||||||
input_size_13, input_size_13, w13_n_group_stride,
|
|
||||||
actual_n_tile_size, false);
|
|
||||||
|
|
||||||
if (w13_bias != nullptr) {
|
|
||||||
cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
|
|
||||||
w13_output_buffer_1_iter, w13_output_buffer_1_iter,
|
|
||||||
w13_bias_ptr_1_iter, actual_token_num, actual_n_tile_size,
|
|
||||||
actual_n_tile_size);
|
|
||||||
w13_bias_ptr_1_iter += gemm_n_tile_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// update
|
|
||||||
w13_weight_ptr_0_iter += w13_n_tile_stride;
|
|
||||||
w13_weight_ptr_1_iter += w13_n_tile_stride;
|
|
||||||
w13_output_buffer_0_iter += gemm_n_tile_size;
|
|
||||||
w13_output_buffer_1_iter += gemm_n_tile_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
apply_gated_act(act_type, w13_output_buffer,
|
|
||||||
curr_w13_gemm_output_buffer, actual_token_num,
|
|
||||||
actual_n_tile_size, actual_n_tile_size,
|
|
||||||
output_size_13 / 2);
|
|
||||||
|
|
||||||
// update
|
|
||||||
curr_w13_gemm_output_buffer +=
|
|
||||||
gemm_m_tile_size * (output_size_13 / 2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// w2 GEMM
|
|
||||||
{
|
|
||||||
alignas(64) cpu_utils::Counter counter;
|
|
||||||
cpu_utils::Counter* counter_ptr = &counter;
|
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static, 1)
|
|
||||||
for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
|
|
||||||
const int32_t task_num_per_expert =
|
|
||||||
(output_size_2 + w2_n_tile_size - 1) / w2_n_tile_size;
|
|
||||||
const int32_t task_num = task_num_per_expert * expert_num;
|
|
||||||
scalar_t* __restrict__ w13_gemm_output_buffer =
|
|
||||||
reinterpret_cast<scalar_t*>(common_buffer_start +
|
|
||||||
w13_gemm_output_buffer_offset);
|
|
||||||
float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
|
|
||||||
common_buffer_start + w2_gemm_output_buffer_offset);
|
|
||||||
|
|
||||||
gemm_t gemm;
|
|
||||||
|
|
||||||
const int32_t w2_n_tile_stride = gemm_n_tile_size * input_size_2;
|
|
||||||
const int32_t w2_n_group_stride = 16 * input_size_2;
|
|
||||||
|
|
||||||
for (;;) {
|
|
||||||
int32_t task_id = counter_ptr->acquire_counter();
|
|
||||||
if (task_id >= task_num) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t curr_expert_id = task_id / task_num_per_expert;
|
|
||||||
const int32_t curr_output_group_id = task_id % task_num_per_expert;
|
|
||||||
const int32_t curr_token_num =
|
|
||||||
token_num_per_group_buffer[curr_expert_id];
|
|
||||||
if (curr_token_num == 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t actual_n_tile_size =
|
|
||||||
std::min(w2_n_tile_size,
|
|
||||||
output_size_2 - curr_output_group_id * w2_n_tile_size);
|
|
||||||
scalar_t* __restrict__ curr_w13_gemm_output_buffer =
|
|
||||||
w13_gemm_output_buffer +
|
|
||||||
cu_token_num_per_group_buffer[curr_expert_id] * input_size_2;
|
|
||||||
float* __restrict__ curr_w2_gemm_output_buffer =
|
|
||||||
w2_gemm_output_buffer +
|
|
||||||
cu_token_num_per_group_buffer[curr_expert_id] * output_size_2 +
|
|
||||||
curr_output_group_id * w2_n_tile_size;
|
|
||||||
scalar_t* __restrict__ w2_weight_ptr =
|
|
||||||
w2 + curr_expert_id * output_size_2 * input_size_2 +
|
|
||||||
curr_output_group_id * w2_n_tile_size * input_size_2;
|
|
||||||
scalar_t* __restrict__ w2_bias_ptr = nullptr;
|
|
||||||
if (w2_bias != nullptr) {
|
|
||||||
w2_bias_ptr = w2_bias + curr_expert_id * output_size_2 +
|
|
||||||
curr_output_group_id * w2_n_tile_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int32_t token_idx = 0; token_idx < curr_token_num;
|
|
||||||
token_idx += gemm_m_tile_size) {
|
|
||||||
const int32_t actual_token_num =
|
|
||||||
std::min(gemm_m_tile_size, curr_token_num - token_idx);
|
|
||||||
|
|
||||||
scalar_t* __restrict__ w2_weight_ptr_iter = w2_weight_ptr;
|
|
||||||
scalar_t* __restrict__ w2_bias_ptr_iter = w2_bias_ptr;
|
|
||||||
float* __restrict__ curr_w2_gemm_output_buffer_iter =
|
|
||||||
curr_w2_gemm_output_buffer;
|
|
||||||
for (int32_t i = 0; i < actual_n_tile_size; i += gemm_n_tile_size) {
|
|
||||||
gemm.gemm(curr_w13_gemm_output_buffer, w2_weight_ptr_iter,
|
|
||||||
curr_w2_gemm_output_buffer_iter, actual_token_num,
|
|
||||||
input_size_2, input_size_2, w2_n_group_stride,
|
|
||||||
output_size_2, false);
|
|
||||||
|
|
||||||
if (w2_bias != nullptr) {
|
|
||||||
cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
|
|
||||||
curr_w2_gemm_output_buffer_iter,
|
|
||||||
curr_w2_gemm_output_buffer_iter, w2_bias_ptr_iter,
|
|
||||||
actual_token_num, output_size_2, output_size_2);
|
|
||||||
w2_bias_ptr_iter += gemm_n_tile_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
w2_weight_ptr_iter += w2_n_tile_stride;
|
|
||||||
curr_w2_gemm_output_buffer_iter += gemm_n_tile_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// update
|
|
||||||
curr_w13_gemm_output_buffer += gemm_m_tile_size * input_size_2;
|
|
||||||
curr_w2_gemm_output_buffer += gemm_m_tile_size * output_size_2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// weighted sum
|
|
||||||
{
|
|
||||||
alignas(64) cpu_utils::Counter counter;
|
|
||||||
cpu_utils::Counter* counter_ptr = &counter;
|
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static, 1)
|
|
||||||
for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
|
|
||||||
const int32_t task_num = token_num;
|
|
||||||
uint8_t* __restrict__ thread_buffer =
|
|
||||||
thread_buffer_start + thread_id * ws_thread_buffer_offset;
|
|
||||||
float* __restrict__ ws_output_buffer =
|
|
||||||
reinterpret_cast<float*>(thread_buffer + ws_output_buffer_offset);
|
|
||||||
float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
|
|
||||||
common_buffer_start + w2_gemm_output_buffer_offset);
|
|
||||||
|
|
||||||
for (;;) {
|
|
||||||
int32_t task_id = counter_ptr->acquire_counter();
|
|
||||||
if (task_id >= task_num) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t token_id = task_id;
|
|
||||||
int32_t* __restrict__ curr_expand_token_id_index_buffer =
|
|
||||||
expand_token_id_index_buffer + token_id * topk_num;
|
|
||||||
float* __restrict__ curr_weight = topk_weights + token_id * topk_num;
|
|
||||||
scalar_t* __restrict__ curr_output_buffer =
|
|
||||||
output + token_id * output_size_2;
|
|
||||||
|
|
||||||
if (topk_num > 1) {
|
|
||||||
{
|
|
||||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
|
|
||||||
float* __restrict__ w2_output_iter =
|
|
||||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
|
||||||
float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
|
|
||||||
vec_op::FP32Vec16 weight_vec(curr_weight[0]);
|
|
||||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
|
||||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
|
||||||
vec = vec * weight_vec;
|
|
||||||
vec.save(ws_output_buffer_iter);
|
|
||||||
|
|
||||||
// update
|
|
||||||
w2_output_iter += 16;
|
|
||||||
ws_output_buffer_iter += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
for (int32_t idx = 1; idx < topk_num - 1; ++idx) {
|
|
||||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx];
|
|
||||||
float* __restrict__ w2_output_iter =
|
|
||||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
|
||||||
float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
|
|
||||||
vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
|
|
||||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
|
||||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
|
||||||
vec_op::FP32Vec16 sum(ws_output_buffer_iter);
|
|
||||||
sum = sum + vec * weight_vec;
|
|
||||||
sum.save(ws_output_buffer_iter);
|
|
||||||
|
|
||||||
// update
|
|
||||||
w2_output_iter += 16;
|
|
||||||
ws_output_buffer_iter += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
int32_t idx = topk_num - 1;
|
|
||||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[idx];
|
|
||||||
float* __restrict__ w2_output_iter =
|
|
||||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
|
||||||
float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
|
|
||||||
scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
|
|
||||||
vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
|
|
||||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
|
||||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
|
||||||
vec_op::FP32Vec16 sum(ws_output_buffer_iter);
|
|
||||||
sum = sum + vec * weight_vec;
|
|
||||||
scalar_vec_t out_vec(sum);
|
|
||||||
out_vec.save(curr_output_buffer_iter);
|
|
||||||
|
|
||||||
// update
|
|
||||||
w2_output_iter += 16;
|
|
||||||
ws_output_buffer_iter += 16;
|
|
||||||
curr_output_buffer_iter += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
|
|
||||||
float* __restrict__ w2_output_iter =
|
|
||||||
w2_gemm_output_buffer + w2_output_idx * output_size_2;
|
|
||||||
scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
|
|
||||||
vec_op::FP32Vec16 weight_vec(curr_weight[0]);
|
|
||||||
for (int32_t i = 0; i < output_size_2; i += 16) {
|
|
||||||
vec_op::FP32Vec16 vec(w2_output_iter);
|
|
||||||
vec = vec * weight_vec;
|
|
||||||
scalar_vec_t out_vec(vec);
|
|
||||||
out_vec.save(curr_output_buffer_iter);
|
|
||||||
|
|
||||||
// update
|
|
||||||
w2_output_iter += 16;
|
|
||||||
curr_output_buffer_iter += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
void prepack_moe_weight(
|
|
||||||
const torch::Tensor& weight, // [expert_num, output_size, input_size]
|
|
||||||
torch::Tensor& packed_weight, const std::string& isa) {
|
|
||||||
TORCH_CHECK(weight.is_contiguous());
|
|
||||||
const int32_t expert_num = weight.size(0);
|
|
||||||
const int32_t output_size = weight.size(1);
|
|
||||||
const int32_t input_size = weight.size(2);
|
|
||||||
TORCH_CHECK_EQ(output_size % 32, 0);
|
|
||||||
const int64_t expert_stride = weight.stride(0);
|
|
||||||
cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
|
|
||||||
|
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
|
||||||
weight.scalar_type(), "prepack_moe_weight", [&]() {
|
|
||||||
CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
|
|
||||||
scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
|
|
||||||
scalar_t* packed_weight_ptr = packed_weight.data_ptr<scalar_t>();
|
|
||||||
prepack_moe_weight_impl<scalar_t, gemm_t>(
|
|
||||||
weight_ptr, packed_weight_ptr, expert_num, output_size,
|
|
||||||
input_size, expert_stride);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fused Mixture-of-Experts forward pass on CPU.
//
// Extracts the problem sizes from the tensors, validates the layout
// assumptions of the kernel, and dispatches fused_moe_impl on the weight
// dtype and the requested ISA. The result is written into `output`.
//
// All tensors are handed to the kernel as raw pointers plus sizes, so they
// have to be dense: the kernel addresses rows as `base + row * row_size`.
void cpu_fused_moe(
    torch::Tensor& output,       // [token_num, output_size_2]
    const torch::Tensor& input,  // [token_num, input_size_13]
    const torch::Tensor&
        w13,  // [expert_num, output_size_13, input_size_13], packed
    const torch::Tensor&
        w2,  // [expert_num, output_size_2, input_size_2], packed
    const std::optional<torch::Tensor>&
        w13_bias,  // [expert_num, output_size_13]
    const std::optional<torch::Tensor>& w2_bias,  // [expert_num, output_size_2]
    const torch::Tensor& topk_weights,            // [token_num, k], float32
    const torch::Tensor& topk_id,                 // [token_num, k], int32
    const std::string& act, const std::string& isa) {
  const int32_t token_num = input.size(0);
  const int32_t input_size_13 = input.size(1);
  const int64_t input_stride = input.stride(0);
  TORCH_CHECK_EQ(input_stride, input_size_13);
  const int32_t expert_num = w13.size(0);
  const int32_t output_size_13 = w13.size(1);
  const int32_t input_size_2 = w2.size(2);
  const int32_t output_size_2 = w2.size(1);
  const int32_t topk_num = topk_id.size(1);

  // Layout checks: only pointers and sizes reach the kernel, no strides.
  TORCH_CHECK(w13.is_contiguous() && w2.is_contiguous(),
              "cpu_fused_moe: w13 and w2 must be contiguous");
  TORCH_CHECK(!w13_bias.has_value() || w13_bias->is_contiguous(),
              "cpu_fused_moe: w13_bias must be contiguous");
  TORCH_CHECK(!w2_bias.has_value() || w2_bias->is_contiguous(),
              "cpu_fused_moe: w2_bias must be contiguous");
  TORCH_CHECK(output.is_contiguous(),
              "cpu_fused_moe: output must be contiguous");
  TORCH_CHECK_EQ(output.size(0), token_num);
  TORCH_CHECK_EQ(output.size(1), output_size_2);
  TORCH_CHECK(topk_weights.is_contiguous() && topk_id.is_contiguous(),
              "cpu_fused_moe: topk_weights and topk_id must be contiguous");
  TORCH_CHECK_EQ(topk_id.size(0), token_num);
  TORCH_CHECK_EQ(topk_weights.size(0), token_num);
  TORCH_CHECK_EQ(topk_weights.size(1), topk_num);
  // The weighted-sum stage always reads the first selected expert of a token.
  TORCH_CHECK(topk_num > 0, "cpu_fused_moe: topk must be at least 1");
  // The weighted-sum stage loads and stores full 16-lane vectors per row.
  TORCH_CHECK_EQ(output_size_2 % 16, 0);

  const FusedMOEAct act_type = get_act_type(act);
  cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);

  VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() {
    CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
      fused_moe_impl<scalar_t, scalar_t, gemm_t>(
          output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
          w13.data_ptr<scalar_t>(), w2.data_ptr<scalar_t>(),
          w13_bias.has_value() ? w13_bias->data_ptr<scalar_t>() : nullptr,
          w2_bias.has_value() ? w2_bias->data_ptr<scalar_t>() : nullptr,
          topk_weights.data_ptr<float>(), topk_id.data_ptr<int32_t>(), act_type,
          token_num, expert_num, topk_num, input_size_13, output_size_13,
          input_size_2, output_size_2);
    });
  });
}
|
|
||||||
@@ -352,10 +352,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
explicit FP32Vec16(bool, void* ptr)
|
explicit FP32Vec16(bool, void* ptr)
|
||||||
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
|
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
|
||||||
|
|
||||||
// strided load
|
|
||||||
explicit FP32Vec16(const float* ptr, INT32Vec16 idx)
|
|
||||||
: reg(_mm512_i32gather_ps(idx.reg, ptr, 4)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(__m512 data) : reg(data) {}
|
explicit FP32Vec16(__m512 data) : reg(data) {}
|
||||||
|
|
||||||
// de-pack 4 bit values
|
// de-pack 4 bit values
|
||||||
@@ -412,10 +408,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
return FP32Vec16(_mm512_sub_ps(reg, b.reg));
|
return FP32Vec16(_mm512_sub_ps(reg, b.reg));
|
||||||
}
|
}
|
||||||
|
|
||||||
FP32Vec16 operator-() const {
  // -0.0f has only the sign bit set, so XOR-ing with it flips the sign of
  // every lane.
  const __m512 sign_bit = _mm512_set1_ps(-0.0f);
  return FP32Vec16(_mm512_xor_ps(reg, sign_bit));
}
|
|
||||||
|
|
||||||
FP32Vec16 operator/(const FP32Vec16& b) const {
|
FP32Vec16 operator/(const FP32Vec16& b) const {
|
||||||
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#include "cpu/cpu_types.hpp"
|
#include "cpu_types.hpp"
|
||||||
#include "cpu/utils.hpp"
|
#include "scratchpad_manager.h"
|
||||||
|
#include "utils.hpp"
|
||||||
|
|
||||||
#ifdef CPU_CAPABILITY_AMXBF16
|
#ifdef CPU_CAPABILITY_AMXBF16
|
||||||
#include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
|
#include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
|
||||||
@@ -157,7 +158,7 @@ void cpu_gemm_wna16_impl(
|
|||||||
// a simple schedule policy, just to hold more B tiles in L2 and make sure
|
// a simple schedule policy, just to hold more B tiles in L2 and make sure
|
||||||
// each thread has tasks
|
// each thread has tasks
|
||||||
const int32_t n_partition_size = [&]() {
|
const int32_t n_partition_size = [&]() {
|
||||||
const int64_t cache_size = cpu_utils::get_available_l2_size();
|
const int64_t cache_size = cpu_utils::get_l2_size();
|
||||||
int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
|
int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
|
||||||
int64_t ps_thread_limit = n_size / thread_num;
|
int64_t ps_thread_limit = n_size / thread_num;
|
||||||
ps_cache_limit =
|
ps_cache_limit =
|
||||||
@@ -178,8 +179,8 @@ void cpu_gemm_wna16_impl(
|
|||||||
const int64_t b_buffer_offset = 0;
|
const int64_t b_buffer_offset = 0;
|
||||||
const int64_t c_buffer_offset = b_buffer_size;
|
const int64_t c_buffer_offset = b_buffer_size;
|
||||||
const int64_t buffer_size = b_buffer_size + c_buffer_size;
|
const int64_t buffer_size = b_buffer_size + c_buffer_size;
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size *
|
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size *
|
||||||
thread_num);
|
thread_num);
|
||||||
|
|
||||||
alignas(64) cpu_utils::Counter counter;
|
alignas(64) cpu_utils::Counter counter;
|
||||||
cpu_utils::Counter* counter_ptr = &counter;
|
cpu_utils::Counter* counter_ptr = &counter;
|
||||||
@@ -189,10 +190,9 @@ void cpu_gemm_wna16_impl(
|
|||||||
scalar_t* __restrict__ b_buffer = nullptr;
|
scalar_t* __restrict__ b_buffer = nullptr;
|
||||||
float* __restrict__ c_buffer = nullptr;
|
float* __restrict__ c_buffer = nullptr;
|
||||||
{
|
{
|
||||||
uint8_t* buffer_ptr =
|
uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager()
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()
|
->get_data<uint8_t>() +
|
||||||
->get_data<uint8_t>() +
|
thread_id * buffer_size;
|
||||||
thread_id * buffer_size;
|
|
||||||
b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
|
b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
|
||||||
c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
|
c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,8 +4,8 @@
|
|||||||
#include "common/memory_desc.hpp"
|
#include "common/memory_desc.hpp"
|
||||||
#include "common/memory.hpp"
|
#include "common/memory.hpp"
|
||||||
|
|
||||||
#include "cpu/utils.hpp"
|
#include "dnnl_helper.h"
|
||||||
#include "cpu/dnnl_helper.h"
|
#include "scratchpad_manager.h"
|
||||||
|
|
||||||
static dnnl::engine& default_engine() {
|
static dnnl::engine& default_engine() {
|
||||||
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
|
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
|
||||||
@@ -274,7 +274,7 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
|
|||||||
|
|
||||||
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
|
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
|
||||||
scratchpad_storage->set_data_handle(
|
scratchpad_storage->set_data_handle(
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<void>());
|
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
||||||
|
|
||||||
matmul.execute(default_stream(), memory_cache_);
|
matmul.execute(default_stream(), memory_cache_);
|
||||||
default_stream().wait();
|
default_stream().wait();
|
||||||
@@ -294,7 +294,7 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
|
|||||||
|
|
||||||
return m_size_cache_->get_or_create(key, [&]() {
|
return m_size_cache_->get_or_create(key, [&]() {
|
||||||
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
|
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
|
||||||
auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager();
|
auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
|
||||||
manager->realloc(desc.scratchpad_desc().get_size());
|
manager->realloc(desc.scratchpad_desc().get_size());
|
||||||
return dnnl::matmul(desc);
|
return dnnl::matmul(desc);
|
||||||
});
|
});
|
||||||
@@ -470,7 +470,7 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
|
|||||||
|
|
||||||
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
|
auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
|
||||||
scratchpad_storage->set_data_handle(
|
scratchpad_storage->set_data_handle(
|
||||||
cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<void>());
|
DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
|
||||||
|
|
||||||
matmul.execute(default_stream(), memory_cache_);
|
matmul.execute(default_stream(), memory_cache_);
|
||||||
default_stream().wait();
|
default_stream().wait();
|
||||||
@@ -486,7 +486,7 @@ dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
|
|||||||
}
|
}
|
||||||
return m_size_cache_->get_or_create(key, [&]() {
|
return m_size_cache_->get_or_create(key, [&]() {
|
||||||
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
|
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
|
||||||
auto manager = cpu_utils::ScratchPadManager::get_scratchpad_manager();
|
auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
|
||||||
manager->realloc(desc.scratchpad_desc().get_size());
|
manager->realloc(desc.scratchpad_desc().get_size());
|
||||||
return dnnl::matmul(desc);
|
return dnnl::matmul(desc);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -235,39 +235,6 @@ class MicroGemm<cpu_utils::ISA::AMX, scalar_t> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void pack_weight(const scalar_t* __restrict__ weight,
|
|
||||||
scalar_t* __restrict__ packed_weight,
|
|
||||||
const int32_t output_size, const int32_t input_size) {
|
|
||||||
constexpr int32_t elem_num_per_group = 4 / sizeof(scalar_t);
|
|
||||||
TORCH_CHECK_EQ(output_size % 16, 0);
|
|
||||||
TORCH_CHECK_EQ(input_size % (16 * elem_num_per_group), 0);
|
|
||||||
|
|
||||||
const int32_t output_group_num = output_size / 16;
|
|
||||||
const int32_t input_32b_num = input_size / elem_num_per_group;
|
|
||||||
for (int32_t output_group_idx = 0; output_group_idx < output_group_num;
|
|
||||||
++output_group_idx) {
|
|
||||||
const int32_t* __restrict__ weight_32b =
|
|
||||||
reinterpret_cast<const int32_t*>(weight);
|
|
||||||
int32_t* __restrict__ packed_weight_32b =
|
|
||||||
reinterpret_cast<int32_t*>(packed_weight);
|
|
||||||
for (int32_t output_idx = 0; output_idx < 16; ++output_idx) {
|
|
||||||
for (int32_t weight_offset = 0, packed_offset = 0;
|
|
||||||
weight_offset < input_32b_num;
|
|
||||||
++weight_offset, packed_offset += 16) {
|
|
||||||
packed_weight_32b[packed_offset] = weight_32b[weight_offset];
|
|
||||||
}
|
|
||||||
|
|
||||||
// update
|
|
||||||
weight_32b += input_32b_num;
|
|
||||||
packed_weight_32b += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// update
|
|
||||||
weight += 16 * input_size;
|
|
||||||
packed_weight += 16 * input_size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
alignas(64) __tilecfg amx_tile_config_;
|
alignas(64) __tilecfg amx_tile_config_;
|
||||||
int32_t curr_m_;
|
int32_t curr_m_;
|
||||||
|
|||||||
@@ -13,9 +13,6 @@ namespace cpu_micro_gemm {
|
|||||||
#define CPU_MICRO_GEMM_PARAMS \
|
#define CPU_MICRO_GEMM_PARAMS \
|
||||||
a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c
|
a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c
|
||||||
|
|
||||||
// Note: weights for MicroGemm must be packed as (output_size / 16) contiguous
// blocks, i.e. each block has the logical shape [16, input_size]. The actual
// layout inside a block may be ISA-specific.
|
|
||||||
template <cpu_utils::ISA isa, typename scalar_t>
|
template <cpu_utils::ISA isa, typename scalar_t>
|
||||||
class MicroGemm {
|
class MicroGemm {
|
||||||
public:
|
public:
|
||||||
@@ -89,41 +86,6 @@ FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr,
|
|||||||
curr_d += ldd;
|
curr_d += ldd;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int32_t n_size, typename scalar_t>
|
|
||||||
FORCE_INLINE void add_bias_epilogue(float* c_ptr, float* d_ptr,
|
|
||||||
scalar_t* __restrict__ bias_ptr,
|
|
||||||
const int32_t m, const int64_t ldc,
|
|
||||||
const int64_t ldd) {
|
|
||||||
using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
|
||||||
static_assert(n_size % 16 == 0);
|
|
||||||
constexpr int32_t n_group_num = n_size / 16;
|
|
||||||
static_assert(n_group_num <= 16);
|
|
||||||
|
|
||||||
vec_op::FP32Vec16 bias_vecs[n_group_num];
|
|
||||||
scalar_t* __restrict__ curr_bias = bias_ptr;
|
|
||||||
vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t i) {
|
|
||||||
scalar_vec_t vec(curr_bias);
|
|
||||||
bias_vecs[i] = vec_op::FP32Vec16(vec);
|
|
||||||
curr_bias += 16;
|
|
||||||
});
|
|
||||||
|
|
||||||
float* curr_c = c_ptr;
|
|
||||||
float* curr_d = d_ptr;
|
|
||||||
for (int32_t i = 0; i < m; ++i) {
|
|
||||||
float* curr_c_iter = curr_c;
|
|
||||||
float* curr_d_iter = curr_d;
|
|
||||||
vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t n_g_idx) {
|
|
||||||
vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
|
|
||||||
c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx];
|
|
||||||
c_vec_fp32.save(curr_d_iter);
|
|
||||||
curr_c_iter += 16;
|
|
||||||
curr_d_iter += 16;
|
|
||||||
});
|
|
||||||
curr_c += ldc;
|
|
||||||
curr_d += ldd;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace cpu_micro_gemm
|
} // namespace cpu_micro_gemm
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -109,25 +109,6 @@ class MicroGemm<cpu_utils::ISA::VEC, scalar_t> {
|
|||||||
void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||||
TileGemm82<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
|
TileGemm82<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note: pack contiguous weight [output_size, input_size] as contiguous
|
|
||||||
// packed weight [output_size / 16, input_size, 16]
|
|
||||||
static void pack_weight(const scalar_t* __restrict__ weight,
|
|
||||||
scalar_t* __restrict__ packed_weight,
|
|
||||||
const int32_t output_size, const int32_t input_size) {
|
|
||||||
TORCH_CHECK_EQ(output_size % 16, 0);
|
|
||||||
for (int32_t o_idx = 0; o_idx < output_size; ++o_idx) {
|
|
||||||
const scalar_t* __restrict__ curr_weight = weight + o_idx * input_size;
|
|
||||||
scalar_t* __restrict__ curr_packed_weight =
|
|
||||||
packed_weight + (o_idx / 16) * (16 * input_size) + o_idx % 16;
|
|
||||||
for (int32_t i_idx = 0; i_idx < input_size; ++i_idx) {
|
|
||||||
*curr_packed_weight = *curr_weight;
|
|
||||||
|
|
||||||
curr_packed_weight += 16;
|
|
||||||
++curr_weight;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
} // namespace cpu_micro_gemm
|
} // namespace cpu_micro_gemm
|
||||||
|
|
||||||
|
|||||||
23
csrc/cpu/scratchpad_manager.cpp
Normal file
23
csrc/cpu/scratchpad_manager.cpp
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
#include <cstdlib>
#include <new>

#include "scratchpad_manager.h"
|
||||||
|
|
||||||
|
// Start with an empty manager, then reserve an initial buffer of 128
// allocation units.
DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
  const size_t initial_size = allocation_unit * 128;
  this->realloc(initial_size);
}
|
||||||
|
|
||||||
|
void DNNLScratchPadManager::realloc(size_t new_size) {
|
||||||
|
new_size = round(new_size);
|
||||||
|
if (new_size > size_) {
|
||||||
|
if (ptr_ != nullptr) {
|
||||||
|
std::free(ptr_);
|
||||||
|
}
|
||||||
|
ptr_ = std::aligned_alloc(64, new_size);
|
||||||
|
size_ = new_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the process-wide manager, constructed on first use as a
// function-local static.
DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
  static DNNLScratchPadManager instance;
  return &instance;
}
|
||||||
31
csrc/cpu/scratchpad_manager.h
Normal file
31
csrc/cpu/scratchpad_manager.h
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
#ifndef SCRATCHPAD_MANAGER_H
|
||||||
|
#define SCRATCHPAD_MANAGER_H
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
// Owner of a single grow-only scratch buffer, reachable through
// get_dnnl_scratchpad_manager().
class DNNLScratchPadManager {
 public:
  // Granularity of every allocation request.
  static constexpr size_t allocation_unit = 4 * 1024;  // 4KB

  // Access the shared manager instance.
  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();

  DNNLScratchPadManager();

  // Current buffer, viewed as a pointer to T.
  template <typename T>
  T* get_data() {
    return static_cast<T*>(ptr_);
  }

  // Round `size` up to the next multiple of allocation_unit.
  static size_t round(size_t size) {
    const size_t unit_count = (size + allocation_unit - 1) / allocation_unit;
    return unit_count * allocation_unit;
  }

  // Ensure the buffer holds at least `new_size` bytes (after rounding).
  void realloc(size_t new_size);

 private:
  size_t size_;  // current capacity in bytes
  void* ptr_;    // start of the buffer
};
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -110,17 +110,6 @@ void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight,
|
|||||||
const std::optional<torch::Tensor>& bias,
|
const std::optional<torch::Tensor>& bias,
|
||||||
const int64_t pack_factor, const std::string& isa_hint);
|
const int64_t pack_factor, const std::string& isa_hint);
|
||||||
|
|
||||||
void prepack_moe_weight(const torch::Tensor& weight,
|
|
||||||
torch::Tensor& packed_weight, const std::string& isa);
|
|
||||||
|
|
||||||
void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input,
|
|
||||||
const torch::Tensor& w13, const torch::Tensor& w2,
|
|
||||||
const std::optional<torch::Tensor>& w13_bias,
|
|
||||||
const std::optional<torch::Tensor>& w2_bias,
|
|
||||||
const torch::Tensor& topk_weights,
|
|
||||||
const torch::Tensor& topk_id, const std::string& act,
|
|
||||||
const std::string& isa);
|
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||||
// vLLM custom ops
|
// vLLM custom ops
|
||||||
|
|
||||||
@@ -307,19 +296,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
"pack_factor, str isa_hint) -> ()");
|
"pack_factor, str isa_hint) -> ()");
|
||||||
ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16);
|
ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// fused moe
|
|
||||||
#if defined(__AVX512F__)
|
|
||||||
ops.def(
|
|
||||||
"prepack_moe_weight(Tensor weight, Tensor(a1!) packed_weight, str isa) "
|
|
||||||
"-> ()");
|
|
||||||
ops.impl("prepack_moe_weight", torch::kCPU, &prepack_moe_weight);
|
|
||||||
ops.def(
|
|
||||||
"cpu_fused_moe(Tensor(a0!) output, Tensor input, Tensor w13, Tensor w2, "
|
|
||||||
"Tensor? w13_bias, Tensor? w2_bias, Tensor topk_weights, Tensor topk_id, "
|
|
||||||
"str act, str isa) -> ()");
|
|
||||||
ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
#define gettid() syscall(SYS_gettid)
|
#define gettid() syscall(SYS_gettid)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "cpu/utils.hpp"
|
#include "cpu_types.hpp"
|
||||||
|
|
||||||
#ifdef VLLM_NUMA_DISABLED
|
#ifdef VLLM_NUMA_DISABLED
|
||||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||||
@@ -24,8 +24,6 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
#ifndef VLLM_NUMA_DISABLED
|
#ifndef VLLM_NUMA_DISABLED
|
||||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||||
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
||||||
TORCH_CHECK(omp_cpu_mask != nullptr,
|
|
||||||
"Failed to parse CPU string: " + cpu_ids);
|
|
||||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
TORCH_CHECK(omp_cpu_mask->size > 0);
|
||||||
std::vector<int> omp_cpu_ids;
|
std::vector<int> omp_cpu_ids;
|
||||||
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
||||||
@@ -46,12 +44,20 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
|
|
||||||
// Memory node binding
|
// Memory node binding
|
||||||
if (numa_available() != -1) {
|
if (numa_available() != -1) {
|
||||||
|
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
|
||||||
std::set<int> node_ids;
|
std::set<int> node_ids;
|
||||||
for (const auto& cpu_id : omp_cpu_ids) {
|
for (const auto& cpu_id : omp_cpu_ids) {
|
||||||
int node_id = numa_node_of_cpu(cpu_id);
|
int node_id = numa_node_of_cpu(cpu_id);
|
||||||
if (node_id != -1) {
|
if (node_id != -1) {
|
||||||
node_ids.insert(node_id);
|
node_ids.insert(node_id);
|
||||||
}
|
}
|
||||||
|
if (node_id != mem_node_id) {
|
||||||
|
TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
|
||||||
|
omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
|
||||||
|
". All CPUs should be on the same NUMA node for optimal "
|
||||||
|
"performance. Memory will be bound to NUMA node ",
|
||||||
|
mem_node_id, ".");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Concatenate all node_ids into a single comma-separated string
|
// Concatenate all node_ids into a single comma-separated string
|
||||||
if (!node_ids.empty()) {
|
if (!node_ids.empty()) {
|
||||||
@@ -64,7 +70,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
||||||
bitmask* src_mask = numa_get_mems_allowed();
|
bitmask* src_mask = numa_get_membind();
|
||||||
|
|
||||||
int pid = getpid();
|
int pid = getpid();
|
||||||
|
|
||||||
@@ -77,46 +83,15 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
std::to_string(errno));
|
std::to_string(errno));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Restrict memory allocation to the selected NUMA node(s).
|
// restrict memory allocation node.
|
||||||
// Enhances memory locality for the threads bound to those NUMA CPUs.
|
numa_set_membind(mask);
|
||||||
if (node_ids.size() > 1) {
|
|
||||||
errno = 0;
|
|
||||||
numa_set_interleave_mask(mask);
|
|
||||||
if (errno != 0) {
|
|
||||||
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
|
|
||||||
std::to_string(errno));
|
|
||||||
} else {
|
|
||||||
TORCH_WARN(
|
|
||||||
"NUMA binding: Using INTERLEAVE policy for memory "
|
|
||||||
"allocation across multiple NUMA nodes (nodes: " +
|
|
||||||
node_ids_str +
|
|
||||||
"). Memory allocations will be "
|
|
||||||
"interleaved across the specified NUMA nodes.");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
errno = 0;
|
|
||||||
numa_set_membind(mask);
|
|
||||||
if (errno != 0) {
|
|
||||||
TORCH_WARN("numa_set_membind failed. errno: " +
|
|
||||||
std::to_string(errno));
|
|
||||||
} else {
|
|
||||||
TORCH_WARN(
|
|
||||||
"NUMA binding: Using MEMBIND policy for memory "
|
|
||||||
"allocation on the NUMA nodes (" +
|
|
||||||
node_ids_str +
|
|
||||||
"). Memory allocations will be "
|
|
||||||
"strictly bound to these NUMA nodes.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
numa_set_strict(1);
|
numa_set_strict(1);
|
||||||
|
|
||||||
numa_free_nodemask(mask);
|
numa_free_nodemask(mask);
|
||||||
numa_free_nodemask(src_mask);
|
numa_free_nodemask(src_mask);
|
||||||
} else {
|
} else {
|
||||||
TORCH_WARN(
|
TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
|
||||||
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
|
std::to_string(errno));
|
||||||
std::to_string(errno));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -163,26 +138,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
|
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
#endif // VLLM_NUMA_DISABLED
|
#endif
|
||||||
|
|
||||||
namespace cpu_utils {
|
|
||||||
ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
|
|
||||||
this->realloc(allocation_unit * 128);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ScratchPadManager::realloc(size_t new_size) {
|
|
||||||
new_size = round(new_size);
|
|
||||||
if (new_size > size_) {
|
|
||||||
if (ptr_ != nullptr) {
|
|
||||||
std::free(ptr_);
|
|
||||||
}
|
|
||||||
ptr_ = std::aligned_alloc(64, new_size);
|
|
||||||
size_ = new_size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ScratchPadManager* ScratchPadManager::get_scratchpad_manager() {
|
|
||||||
static ScratchPadManager manager;
|
|
||||||
return &manager;
|
|
||||||
}
|
|
||||||
} // namespace cpu_utils
|
|
||||||
|
|||||||
@@ -2,24 +2,19 @@
|
|||||||
#define UTILS_HPP
|
#define UTILS_HPP
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstdint>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <ATen/cpu/Utils.h>
|
|
||||||
|
|
||||||
#include "cpu/cpu_types.hpp"
|
#if defined(__APPLE__)
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "cpu_types.hpp"
|
||||||
|
|
||||||
namespace cpu_utils {
|
namespace cpu_utils {
|
||||||
enum class ISA { AMX, VEC };
|
enum class ISA { AMX, VEC };
|
||||||
|
|
||||||
// Map an ISA name to the ISA enum: "amx" -> ISA::AMX, "vec" -> ISA::VEC.
// Any other name fails the TORCH_CHECK below.
inline ISA get_isa(const std::string& isa) {
  if (isa == "amx") {
    return ISA::AMX;
  }
  if (isa == "vec") {
    return ISA::VEC;
  }
  TORCH_CHECK(false, "Invalid isa type: " + isa);
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct VecTypeTrait {
|
struct VecTypeTrait {
|
||||||
using vec_t = void;
|
using vec_t = void;
|
||||||
@@ -37,12 +32,10 @@ struct VecTypeTrait<c10::BFloat16> {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(__powerpc__)
|
|
||||||
template <>
|
template <>
|
||||||
struct VecTypeTrait<c10::Half> {
|
struct VecTypeTrait<c10::Half> {
|
||||||
using vec_t = vec_op::FP16Vec16;
|
using vec_t = vec_op::FP16Vec16;
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
struct Counter {
|
struct Counter {
|
||||||
std::atomic<int64_t> counter;
|
std::atomic<int64_t> counter;
|
||||||
@@ -55,66 +48,26 @@ struct Counter {
|
|||||||
int64_t acquire_counter() { return counter++; }
|
int64_t acquire_counter() { return counter++; }
|
||||||
};
|
};
|
||||||
|
|
||||||
inline int64_t get_available_l2_size() {
|
inline int64_t get_l2_size() {
|
||||||
static int64_t size = []() {
|
static int64_t size = []() {
|
||||||
const uint32_t l2_cache_size = at::cpu::L2_cache_size();
|
#if defined(__APPLE__)
|
||||||
|
// macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
|
||||||
|
int64_t l2_cache_size = 0;
|
||||||
|
size_t len = sizeof(l2_cache_size);
|
||||||
|
if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
|
||||||
|
l2_cache_size > 0) {
|
||||||
|
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||||
|
}
|
||||||
|
// Fallback if sysctlbyname fails
|
||||||
|
return 128LL * 1024 >> 1; // use 50% of 128KB
|
||||||
|
#else
|
||||||
|
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
assert(l2_cache_size != -1);
|
||||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||||
|
#endif
|
||||||
}();
|
}();
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int32_t alignment_v, typename T>
|
|
||||||
inline T round_up(T size) {
|
|
||||||
T alignment = alignment_v;
|
|
||||||
return (((size + alignment - 1) / alignment) * alignment);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <int32_t alignment_v, typename T>
|
|
||||||
inline T round_down(T size) {
|
|
||||||
T alignment = alignment_v;
|
|
||||||
return (size / alignment) * alignment;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
|
|
||||||
int32_t stride) {
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << std::fixed << std::setprecision(5) << name << ": [\n";
|
|
||||||
auto* curr_logits_buffer = ptr;
|
|
||||||
for (int32_t m = 0; m < row; ++m) {
|
|
||||||
for (int32_t n = 0; n < col; ++n) {
|
|
||||||
ss << curr_logits_buffer[n] << ", ";
|
|
||||||
}
|
|
||||||
ss << "\n";
|
|
||||||
curr_logits_buffer += stride;
|
|
||||||
}
|
|
||||||
ss << "]\n";
|
|
||||||
std::printf("%s", ss.str().c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
class ScratchPadManager {
|
|
||||||
public:
|
|
||||||
static constexpr size_t allocation_unit = 4 * 1024; // 4KB
|
|
||||||
|
|
||||||
static ScratchPadManager* get_scratchpad_manager();
|
|
||||||
|
|
||||||
ScratchPadManager();
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
T* get_data() {
|
|
||||||
return reinterpret_cast<T*>(ptr_);
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t round(size_t size) {
|
|
||||||
return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
|
|
||||||
}
|
|
||||||
|
|
||||||
void realloc(size_t new_size);
|
|
||||||
|
|
||||||
private:
|
|
||||||
size_t size_;
|
|
||||||
void* ptr_;
|
|
||||||
};
|
|
||||||
} // namespace cpu_utils
|
} // namespace cpu_utils
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -107,16 +107,6 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
|||||||
prop.location.id = device;
|
prop.location.id = device;
|
||||||
prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
|
prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
int flag = 0;
|
|
||||||
CUDA_CHECK(cuDeviceGetAttribute(
|
|
||||||
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
|
||||||
device));
|
|
||||||
if (flag) { // support GPUDirect RDMA if possible
|
|
||||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
// Allocate memory using cuMemCreate
|
// Allocate memory using cuMemCreate
|
||||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||||
|
|||||||
@@ -107,8 +107,7 @@ __global__ void fusedQKNormRopeKernel(
|
|||||||
void const* k_weight_void, // RMSNorm weights for key
|
void const* k_weight_void, // RMSNorm weights for key
|
||||||
void const* cos_sin_cache_void, // Pre-computed cos/sin cache
|
void const* cos_sin_cache_void, // Pre-computed cos/sin cache
|
||||||
int64_t const* position_ids, // Position IDs for RoPE
|
int64_t const* position_ids, // Position IDs for RoPE
|
||||||
int const num_tokens, // Number of tokens
|
int const num_tokens // Number of tokens
|
||||||
int const rotary_dim // Dimension for RoPE
|
|
||||||
) {
|
) {
|
||||||
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
|
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
|
||||||
if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
|
if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
|
||||||
@@ -228,59 +227,56 @@ __global__ void fusedQKNormRopeKernel(
|
|||||||
|
|
||||||
// Calculate cache pointer for this position - similar to
|
// Calculate cache pointer for this position - similar to
|
||||||
// pos_encoding_kernels.cu
|
// pos_encoding_kernels.cu
|
||||||
T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim;
|
T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
|
||||||
int const embed_dim = rotary_dim / 2;
|
int const embed_dim = head_dim / 2;
|
||||||
T_cache const* cos_ptr = cache_ptr;
|
T_cache const* cos_ptr = cache_ptr;
|
||||||
T_cache const* sin_ptr = cache_ptr + embed_dim;
|
T_cache const* sin_ptr = cache_ptr + embed_dim;
|
||||||
int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range
|
|
||||||
if (laneId < rotary_lanes) {
|
if constexpr (interleave) {
|
||||||
if constexpr (interleave) {
|
// Perform interleaving. Use pre-computed cos/sin values.
|
||||||
// Perform interleaving. Use pre-computed cos/sin values.
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < numElemsPerThread / 2; ++i) {
|
for (int i = 0; i < numElemsPerThread / 2; ++i) {
|
||||||
int const idx0 = 2 * i;
|
int const idx0 = 2 * i;
|
||||||
int const idx1 = 2 * i + 1;
|
int const idx1 = 2 * i + 1;
|
||||||
// Global dimension index in the head
|
|
||||||
int const dim_idx = laneId * numElemsPerThread + idx0;
|
|
||||||
|
|
||||||
float const val0 = elements[idx0];
|
float const val0 = elements[idx0];
|
||||||
float const val1 = elements[idx1];
|
float const val1 = elements[idx1];
|
||||||
|
|
||||||
int const half_dim = dim_idx / 2;
|
int const dim_idx = laneId * numElemsPerThread + idx0;
|
||||||
float const cos_val =
|
int const half_dim = dim_idx / 2;
|
||||||
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
|
float const cos_val =
|
||||||
float const sin_val =
|
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
|
||||||
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
|
float const sin_val =
|
||||||
|
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
|
||||||
|
|
||||||
elements[idx0] = val0 * cos_val - val1 * sin_val;
|
elements[idx0] = val0 * cos_val - val1 * sin_val;
|
||||||
elements[idx1] = val0 * sin_val + val1 * cos_val;
|
elements[idx1] = val0 * sin_val + val1 * cos_val;
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Before data exchange with in warp, we need to sync.
|
|
||||||
__syncwarp();
|
|
||||||
int pairOffset = (rotary_dim / 2) / numElemsPerThread;
|
|
||||||
// Get the data from the other half of the warp. Use pre-computed
|
|
||||||
// cos/sin values.
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < numElemsPerThread; i++) {
|
|
||||||
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset);
|
|
||||||
|
|
||||||
if (laneId < pairOffset) {
|
|
||||||
elements2[i] = -elements2[i];
|
|
||||||
}
|
|
||||||
int dim_idx = laneId * numElemsPerThread + i;
|
|
||||||
|
|
||||||
dim_idx = (dim_idx * 2) % rotary_dim;
|
|
||||||
int half_dim = dim_idx / 2;
|
|
||||||
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
|
|
||||||
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
|
|
||||||
|
|
||||||
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
|
|
||||||
}
|
|
||||||
// __shfl_xor_sync does not provide memfence. Need to sync again.
|
|
||||||
__syncwarp();
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Before data exchange with in warp, we need to sync.
|
||||||
|
__syncwarp();
|
||||||
|
// Get the data from the other half of the warp. Use pre-computed cos/sin
|
||||||
|
// values.
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < numElemsPerThread; i++) {
|
||||||
|
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
|
||||||
|
if (laneId < 16) {
|
||||||
|
elements2[i] = -elements2[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int dim_idx = laneId * numElemsPerThread + i;
|
||||||
|
dim_idx = (dim_idx * 2) % head_dim;
|
||||||
|
int half_dim = dim_idx / 2;
|
||||||
|
// Use pre-computed cos/sin from cache
|
||||||
|
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
|
||||||
|
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
|
||||||
|
|
||||||
|
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
|
||||||
|
}
|
||||||
|
// __shfl_xor_sync does not provide memfence. Need to sync again.
|
||||||
|
__syncwarp();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store.
|
// Store.
|
||||||
{
|
{
|
||||||
vec_T vec;
|
vec_T vec;
|
||||||
@@ -316,10 +312,10 @@ template <typename scalar_t_in, typename scalar_t_cache>
|
|||||||
void launchFusedQKNormRope(void* qkv, int const num_tokens,
|
void launchFusedQKNormRope(void* qkv, int const num_tokens,
|
||||||
int const num_heads_q, int const num_heads_k,
|
int const num_heads_q, int const num_heads_k,
|
||||||
int const num_heads_v, int const head_dim,
|
int const num_heads_v, int const head_dim,
|
||||||
int const rotary_dim, float const eps,
|
float const eps, void const* q_weight,
|
||||||
void const* q_weight, void const* k_weight,
|
void const* k_weight, void const* cos_sin_cache,
|
||||||
void const* cos_sin_cache, bool const interleave,
|
bool const interleave, int64_t const* position_ids,
|
||||||
int64_t const* position_ids, cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
constexpr int blockSize = 256;
|
constexpr int blockSize = 256;
|
||||||
|
|
||||||
int const warpsPerBlock = blockSize / 32;
|
int const warpsPerBlock = blockSize / 32;
|
||||||
@@ -336,7 +332,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
|
|||||||
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
|
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
|
||||||
<<<gridDim, blockDim, 0, stream>>>(
|
<<<gridDim, blockDim, 0, stream>>>(
|
||||||
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
|
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
|
||||||
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
|
k_weight, cos_sin_cache, position_ids, num_tokens);
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
@@ -344,7 +340,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
|
|||||||
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
|
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
|
||||||
<<<gridDim, blockDim, 0, stream>>>(
|
<<<gridDim, blockDim, 0, stream>>>(
|
||||||
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
|
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
|
||||||
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
|
k_weight, cos_sin_cache, position_ids, num_tokens);
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case 256:
|
case 256:
|
||||||
@@ -352,7 +348,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
|
|||||||
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
|
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
|
||||||
<<<gridDim, blockDim, 0, stream>>>(
|
<<<gridDim, blockDim, 0, stream>>>(
|
||||||
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
|
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
|
||||||
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
|
k_weight, cos_sin_cache, position_ids, num_tokens);
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@@ -396,11 +392,8 @@ void fused_qk_norm_rope(
|
|||||||
"Query weights size must match head dimension");
|
"Query weights size must match head dimension");
|
||||||
TORCH_CHECK(k_weight.size(0) == head_dim,
|
TORCH_CHECK(k_weight.size(0) == head_dim,
|
||||||
"Key weights size must match head dimension");
|
"Key weights size must match head dimension");
|
||||||
|
TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
|
||||||
TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even");
|
"Cos/sin cache dimension must match head_dim");
|
||||||
TORCH_CHECK(cos_sin_cache.size(1) <= head_dim,
|
|
||||||
"rotary_dim must be less than or equal to head_dim");
|
|
||||||
|
|
||||||
TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
|
TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
|
||||||
qkv.scalar_type() == k_weight.scalar_type(),
|
qkv.scalar_type() == k_weight.scalar_type(),
|
||||||
"qkv, q_weight and k_weight must have the same dtype");
|
"qkv, q_weight and k_weight must have the same dtype");
|
||||||
@@ -426,8 +419,7 @@ void fused_qk_norm_rope(
|
|||||||
qkv.data_ptr(), static_cast<int>(num_tokens),
|
qkv.data_ptr(), static_cast<int>(num_tokens),
|
||||||
static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
|
static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
|
||||||
static_cast<int>(num_heads_v), static_cast<int>(head_dim),
|
static_cast<int>(num_heads_v), static_cast<int>(head_dim),
|
||||||
static_cast<int>(cos_sin_cache.size(1)), static_cast<float>(eps),
|
static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
|
||||||
q_weight.data_ptr(), k_weight.data_ptr(),
|
|
||||||
cos_sin_cache.data_ptr(), !is_neox,
|
cos_sin_cache.data_ptr(), !is_neox,
|
||||||
reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
|
reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
|
||||||
stream);
|
stream);
|
||||||
|
|||||||
@@ -446,19 +446,15 @@ __device__ inline T apply_sigmoid(T val) {
|
|||||||
|
|
||||||
template <ScoringFunc SF, typename T>
|
template <ScoringFunc SF, typename T>
|
||||||
__device__ inline T apply_scoring(T val) {
|
__device__ inline T apply_scoring(T val) {
|
||||||
if constexpr (SF == SCORING_NONE) {
|
if constexpr (SF == SCORING_SIGMOID) {
|
||||||
return val;
|
|
||||||
} else if constexpr (SF == SCORING_SIGMOID) {
|
|
||||||
return apply_sigmoid(val);
|
return apply_sigmoid(val);
|
||||||
} else {
|
} else {
|
||||||
static_assert(SF == SCORING_NONE || SF == SCORING_SIGMOID,
|
|
||||||
"Unsupported ScoringFunc in apply_scoring");
|
|
||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename BiasT, ScoringFunc SF>
|
template <typename T, ScoringFunc SF>
|
||||||
__device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
|
__device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||||
cg::thread_block_tile<32> const& tile,
|
cg::thread_block_tile<32> const& tile,
|
||||||
int32_t const lane_id,
|
int32_t const lane_id,
|
||||||
int const num_experts_per_group) {
|
int const num_experts_per_group) {
|
||||||
@@ -469,7 +465,7 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
|
|||||||
if (num_experts_per_group > WARP_SIZE) {
|
if (num_experts_per_group > WARP_SIZE) {
|
||||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||||
T value = apply_scoring<SF>(input[i]);
|
T value = apply_scoring<SF>(input[i]);
|
||||||
value = value + static_cast<T>(bias[i]);
|
value = value + bias[i];
|
||||||
|
|
||||||
if (value > largest) {
|
if (value > largest) {
|
||||||
second_largest = largest;
|
second_largest = largest;
|
||||||
@@ -481,7 +477,7 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
|
|||||||
} else {
|
} else {
|
||||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||||
T value = apply_scoring<SF>(input[i]);
|
T value = apply_scoring<SF>(input[i]);
|
||||||
value = value + static_cast<T>(bias[i]);
|
value = value + bias[i];
|
||||||
largest = value;
|
largest = value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -503,8 +499,8 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename BiasT, ScoringFunc SF>
|
template <typename T, ScoringFunc SF>
|
||||||
__global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
|
__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
|
||||||
int64_t const num_tokens,
|
int64_t const num_tokens,
|
||||||
int64_t const num_cases,
|
int64_t const num_cases,
|
||||||
int64_t const n_group,
|
int64_t const n_group,
|
||||||
@@ -517,7 +513,7 @@ __global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
|
|||||||
input += case_id * num_experts_per_group;
|
input += case_id * num_experts_per_group;
|
||||||
// bias is per expert group, offset to current group
|
// bias is per expert group, offset to current group
|
||||||
int32_t group_id = case_id % n_group;
|
int32_t group_id = case_id % n_group;
|
||||||
BiasT const* group_bias = bias + group_id * num_experts_per_group;
|
T const* group_bias = bias + group_id * num_experts_per_group;
|
||||||
output += case_id;
|
output += case_id;
|
||||||
|
|
||||||
cg::thread_block block = cg::this_thread_block();
|
cg::thread_block block = cg::this_thread_block();
|
||||||
@@ -526,19 +522,18 @@ __global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
|
|||||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||||
asm volatile("griddepcontrol.wait;");
|
asm volatile("griddepcontrol.wait;");
|
||||||
#endif
|
#endif
|
||||||
topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
|
topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
|
||||||
num_experts_per_group);
|
num_experts_per_group);
|
||||||
}
|
}
|
||||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||||
asm volatile("griddepcontrol.launch_dependents;");
|
asm volatile("griddepcontrol.launch_dependents;");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
|
template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
|
||||||
int NGroup = -1>
|
|
||||||
__global__ void group_idx_and_topk_idx_kernel(
|
__global__ void group_idx_and_topk_idx_kernel(
|
||||||
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
|
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
|
||||||
BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
|
T const* bias, int64_t const num_tokens, int64_t const n_group,
|
||||||
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
|
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
|
||||||
int64_t const num_experts_per_group, bool renormalize,
|
int64_t const num_experts_per_group, bool renormalize,
|
||||||
double routed_scaling_factor) {
|
double routed_scaling_factor) {
|
||||||
@@ -624,7 +619,7 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
T input = scores[offset + i];
|
T input = scores[offset + i];
|
||||||
if (is_finite(input)) {
|
if (is_finite(input)) {
|
||||||
T score = apply_scoring<SF>(input);
|
T score = apply_scoring<SF>(input);
|
||||||
candidates = score + static_cast<T>(bias[offset + i]);
|
candidates = score + bias[offset + i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
queue.add(candidates, offset + i);
|
queue.add(candidates, offset + i);
|
||||||
@@ -675,13 +670,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
|
|
||||||
if (case_id < num_tokens) {
|
if (case_id < num_tokens) {
|
||||||
if (if_proceed_next_topk) {
|
if (if_proceed_next_topk) {
|
||||||
float scale = routed_scaling_factor;
|
|
||||||
if (renormalize) {
|
|
||||||
scale /= topk_sum;
|
|
||||||
}
|
|
||||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||||
float base = cuda_cast<float, T>(s_topk_value[i]);
|
float base = cuda_cast<float, T>(s_topk_value[i]);
|
||||||
float value = base * scale;
|
float value = renormalize ? (base / topk_sum * routed_scaling_factor)
|
||||||
|
: (base * routed_scaling_factor);
|
||||||
topk_indices[i] = s_topk_idx[i];
|
topk_indices[i] = s_topk_idx[i];
|
||||||
topk_values[i] = value;
|
topk_values[i] = value;
|
||||||
}
|
}
|
||||||
@@ -699,10 +691,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
|
template <typename T, typename IdxT, ScoringFunc SF>
|
||||||
inline void launch_group_idx_and_topk_kernel(
|
inline void launch_group_idx_and_topk_kernel(
|
||||||
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
|
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
|
||||||
float* topk_values, IdxT* topk_indices, BiasT const* bias,
|
float* topk_values, IdxT* topk_indices, T const* bias,
|
||||||
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
|
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
|
||||||
int64_t const topk, int64_t const num_experts,
|
int64_t const topk, int64_t const num_experts,
|
||||||
int64_t const num_experts_per_group, bool const renormalize,
|
int64_t const num_experts_per_group, bool const renormalize,
|
||||||
@@ -716,36 +708,36 @@ inline void launch_group_idx_and_topk_kernel(
|
|||||||
|
|
||||||
switch (n_group) {
|
switch (n_group) {
|
||||||
case 4: {
|
case 4: {
|
||||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
|
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case 8: {
|
case 8: {
|
||||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
|
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case 16: {
|
case 16: {
|
||||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
|
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case 32: {
|
case 32: {
|
||||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
|
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
|
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename BiasT, typename IdxT>
|
template <typename T, typename IdxT>
|
||||||
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||||
IdxT* topk_indices, BiasT const* bias,
|
IdxT* topk_indices, T const* bias, int64_t const num_tokens,
|
||||||
int64_t const num_tokens, int64_t const num_experts,
|
int64_t const num_experts, int64_t const n_group,
|
||||||
int64_t const n_group, int64_t const topk_group,
|
int64_t const topk_group, int64_t const topk,
|
||||||
int64_t const topk, bool const renormalize,
|
bool const renormalize, double const routed_scaling_factor,
|
||||||
double const routed_scaling_factor, int const scoring_func,
|
int const scoring_func, bool enable_pdl = false,
|
||||||
bool enable_pdl = false, cudaStream_t const stream = 0) {
|
cudaStream_t const stream = 0) {
|
||||||
int64_t num_cases = num_tokens * n_group;
|
int64_t num_cases = num_tokens * n_group;
|
||||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||||
cudaLaunchConfig_t config;
|
cudaLaunchConfig_t config;
|
||||||
@@ -766,12 +758,12 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
|||||||
};
|
};
|
||||||
switch (sf) {
|
switch (sf) {
|
||||||
case SCORING_NONE: {
|
case SCORING_NONE: {
|
||||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
|
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
|
||||||
launch_topk_with_k2(kernel_instance1);
|
launch_topk_with_k2(kernel_instance1);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SCORING_SIGMOID: {
|
case SCORING_SIGMOID: {
|
||||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
|
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
|
||||||
launch_topk_with_k2(kernel_instance1);
|
launch_topk_with_k2(kernel_instance1);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -795,14 +787,14 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
|||||||
config.attrs = attrs;
|
config.attrs = attrs;
|
||||||
switch (sf) {
|
switch (sf) {
|
||||||
case SCORING_NONE: {
|
case SCORING_NONE: {
|
||||||
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
|
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
|
||||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||||
num_tokens, n_group, topk_group, topk, num_experts,
|
num_tokens, n_group, topk_group, topk, num_experts,
|
||||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SCORING_SIGMOID: {
|
case SCORING_SIGMOID: {
|
||||||
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
|
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
|
||||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||||
num_tokens, n_group, topk_group, topk, num_experts,
|
num_tokens, n_group, topk_group, topk, num_experts,
|
||||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||||
@@ -813,23 +805,17 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT) \
|
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
|
||||||
template void invokeNoAuxTc<T, BiasT, IdxT>( \
|
template void invokeNoAuxTc<T, IdxT>( \
|
||||||
T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
|
T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
|
||||||
BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
|
T const* bias, int64_t const num_tokens, int64_t const num_experts, \
|
||||||
int64_t const n_group, int64_t const topk_group, int64_t const topk, \
|
int64_t const n_group, int64_t const topk_group, int64_t const topk, \
|
||||||
bool const renormalize, double const routed_scaling_factor, \
|
bool const renormalize, double const routed_scaling_factor, \
|
||||||
int const scoring_func, bool enable_pdl, cudaStream_t const stream);
|
int const scoring_func, bool enable_pdl, cudaStream_t const stream);
|
||||||
|
|
||||||
INSTANTIATE_NOAUX_TC(float, float, int32_t);
|
INSTANTIATE_NOAUX_TC(float, int32_t);
|
||||||
INSTANTIATE_NOAUX_TC(float, half, int32_t);
|
INSTANTIATE_NOAUX_TC(half, int32_t);
|
||||||
INSTANTIATE_NOAUX_TC(float, __nv_bfloat16, int32_t);
|
INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
|
||||||
INSTANTIATE_NOAUX_TC(half, float, int32_t);
|
|
||||||
INSTANTIATE_NOAUX_TC(half, half, int32_t);
|
|
||||||
INSTANTIATE_NOAUX_TC(half, __nv_bfloat16, int32_t);
|
|
||||||
INSTANTIATE_NOAUX_TC(__nv_bfloat16, float, int32_t);
|
|
||||||
INSTANTIATE_NOAUX_TC(__nv_bfloat16, half, int32_t);
|
|
||||||
INSTANTIATE_NOAUX_TC(__nv_bfloat16, __nv_bfloat16, int32_t);
|
|
||||||
} // end namespace moe
|
} // end namespace moe
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
@@ -838,7 +824,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
|||||||
int64_t topk, bool renormalize, double routed_scaling_factor,
|
int64_t topk, bool renormalize, double routed_scaling_factor,
|
||||||
torch::Tensor const& bias, int64_t scoring_func = 0) {
|
torch::Tensor const& bias, int64_t scoring_func = 0) {
|
||||||
auto data_type = scores.scalar_type();
|
auto data_type = scores.scalar_type();
|
||||||
auto bias_type = bias.scalar_type();
|
|
||||||
auto input_size = scores.sizes();
|
auto input_size = scores.sizes();
|
||||||
int64_t num_tokens = input_size[0];
|
int64_t num_tokens = input_size[0];
|
||||||
int64_t num_experts = input_size[1];
|
int64_t num_experts = input_size[1];
|
||||||
@@ -862,62 +847,39 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
|||||||
|
|
||||||
auto stream = c10::cuda::getCurrentCUDAStream(scores.get_device());
|
auto stream = c10::cuda::getCurrentCUDAStream(scores.get_device());
|
||||||
|
|
||||||
#define LAUNCH_KERNEL(T, IdxT) \
|
|
||||||
do { \
|
|
||||||
switch (bias_type) { \
|
|
||||||
case torch::kFloat16: \
|
|
||||||
vllm::moe::invokeNoAuxTc<T, half, IdxT>( \
|
|
||||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \
|
|
||||||
num_experts, n_group, topk_group, topk, renormalize, \
|
|
||||||
routed_scaling_factor, static_cast<int>(scoring_func), false, \
|
|
||||||
stream); \
|
|
||||||
break; \
|
|
||||||
case torch::kFloat32: \
|
|
||||||
vllm::moe::invokeNoAuxTc<T, float, IdxT>( \
|
|
||||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \
|
|
||||||
num_experts, n_group, topk_group, topk, renormalize, \
|
|
||||||
routed_scaling_factor, static_cast<int>(scoring_func), false, \
|
|
||||||
stream); \
|
|
||||||
break; \
|
|
||||||
case torch::kBFloat16: \
|
|
||||||
vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \
|
|
||||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
|
||||||
reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \
|
|
||||||
num_tokens, num_experts, n_group, topk_group, topk, renormalize, \
|
|
||||||
routed_scaling_factor, static_cast<int>(scoring_func), false, \
|
|
||||||
stream); \
|
|
||||||
break; \
|
|
||||||
default: \
|
|
||||||
throw std::invalid_argument( \
|
|
||||||
"Invalid bias dtype, only supports float16, float32, and " \
|
|
||||||
"bfloat16"); \
|
|
||||||
break; \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
switch (data_type) {
|
switch (data_type) {
|
||||||
case torch::kFloat16:
|
case torch::kFloat16:
|
||||||
// Handle Float16
|
// Handle Float16
|
||||||
LAUNCH_KERNEL(half, int32_t);
|
vllm::moe::invokeNoAuxTc<half, int32_t>(
|
||||||
|
reinterpret_cast<half*>(scores.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,
|
||||||
|
num_experts, n_group, topk_group, topk, renormalize,
|
||||||
|
routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
|
||||||
break;
|
break;
|
||||||
case torch::kFloat32:
|
case torch::kFloat32:
|
||||||
// Handle Float32
|
// Handle Float32
|
||||||
LAUNCH_KERNEL(float, int32_t);
|
vllm::moe::invokeNoAuxTc<float, int32_t>(
|
||||||
|
reinterpret_cast<float*>(scores.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,
|
||||||
|
num_experts, n_group, topk_group, topk, renormalize,
|
||||||
|
routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
|
||||||
break;
|
break;
|
||||||
case torch::kBFloat16:
|
case torch::kBFloat16:
|
||||||
// Handle BFloat16
|
// Handle BFloat16
|
||||||
LAUNCH_KERNEL(__nv_bfloat16, int32_t);
|
vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
|
||||||
|
reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
|
||||||
|
reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), num_tokens,
|
||||||
|
num_experts, n_group, topk_group, topk, renormalize,
|
||||||
|
routed_scaling_factor, static_cast<int>(scoring_func), false, stream);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
// Handle other data types
|
// Handle other data types
|
||||||
@@ -925,6 +887,5 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
|||||||
"Invalid dtype, only supports float16, float32, and bfloat16");
|
"Invalid dtype, only supports float16, float32, and bfloat16");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#undef LAUNCH_KERNEL
|
|
||||||
return {topk_values, topk_indices};
|
return {topk_values, topk_indices};
|
||||||
}
|
}
|
||||||
|
|||||||
1
csrc/moe/marlin_moe_wna16/.gitignore
vendored
1
csrc/moe/marlin_moe_wna16/.gitignore
vendored
@@ -1,3 +1,2 @@
|
|||||||
sm*_kernel_*.cu
|
sm*_kernel_*.cu
|
||||||
kernel_selector.h
|
kernel_selector.h
|
||||||
kernel_*.cu
|
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ import jinja2
|
|||||||
|
|
||||||
ARCHS = []
|
ARCHS = []
|
||||||
SUPPORT_FP8 = False
|
SUPPORT_FP8 = False
|
||||||
SUPPORT_SM75 = False
|
|
||||||
SUPPORT_SM80 = False
|
|
||||||
for arch in sys.argv[1].split(","):
|
for arch in sys.argv[1].split(","):
|
||||||
arch = arch[: arch.index(".") + 2].replace(".", "")
|
arch = arch[: arch.index(".") + 2].replace(".", "")
|
||||||
arch = int(arch)
|
arch = int(arch)
|
||||||
@@ -21,10 +19,6 @@ for arch in sys.argv[1].split(","):
|
|||||||
# with FP16 MMA, so it cannot achieve any acceleration.
|
# with FP16 MMA, so it cannot achieve any acceleration.
|
||||||
if arch in [89, 120]:
|
if arch in [89, 120]:
|
||||||
SUPPORT_FP8 = True
|
SUPPORT_FP8 = True
|
||||||
if arch >= 80:
|
|
||||||
SUPPORT_SM80 = True
|
|
||||||
if arch == 75:
|
|
||||||
SUPPORT_SM75 = True
|
|
||||||
|
|
||||||
FILE_HEAD_COMMENT = """
|
FILE_HEAD_COMMENT = """
|
||||||
// auto generated by generate_kernels.py
|
// auto generated by generate_kernels.py
|
||||||
@@ -163,7 +157,6 @@ def remove_old_kernels():
|
|||||||
|
|
||||||
def generate_new_kernels():
|
def generate_new_kernels():
|
||||||
result_dict = {}
|
result_dict = {}
|
||||||
sm_75_result_dict = {}
|
|
||||||
|
|
||||||
for quant_config in QUANT_CONFIGS:
|
for quant_config in QUANT_CONFIGS:
|
||||||
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
|
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
|
||||||
@@ -181,8 +174,6 @@ def generate_new_kernels():
|
|||||||
s_type = quant_config.get("s_type", c_type)
|
s_type = quant_config.get("s_type", c_type)
|
||||||
if (a_type, b_type, c_type) not in result_dict:
|
if (a_type, b_type, c_type) not in result_dict:
|
||||||
result_dict[(a_type, b_type, c_type)] = []
|
result_dict[(a_type, b_type, c_type)] = []
|
||||||
if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
|
|
||||||
sm_75_result_dict[(a_type, b_type, c_type)] = []
|
|
||||||
|
|
||||||
for group_blocks, m_blocks, thread_configs in itertools.product(
|
for group_blocks, m_blocks, thread_configs in itertools.product(
|
||||||
all_group_blocks, all_m_blocks, all_thread_configs
|
all_group_blocks, all_m_blocks, all_thread_configs
|
||||||
@@ -206,89 +197,78 @@ def generate_new_kernels():
|
|||||||
"thread_k_blocks": thread_k // 16,
|
"thread_k_blocks": thread_k // 16,
|
||||||
"thread_n_blocks": thread_n // 16,
|
"thread_n_blocks": thread_n // 16,
|
||||||
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
|
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
|
||||||
"stages": 4,
|
"stages": "pipe_stages",
|
||||||
"group_blocks": group_blocks,
|
"group_blocks": group_blocks,
|
||||||
"is_zp_float": "false",
|
"is_zp_float": "false",
|
||||||
}
|
}
|
||||||
|
|
||||||
if SUPPORT_SM80:
|
result_dict[(a_type, b_type, c_type)].append(config)
|
||||||
result_dict[(a_type, b_type, c_type)].append(config)
|
|
||||||
if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
|
|
||||||
config_sm75 = config.copy()
|
|
||||||
config_sm75["stages"] = 2
|
|
||||||
sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
|
|
||||||
|
|
||||||
kernel_selector_str = FILE_HEAD_COMMENT
|
kernel_selector_str = FILE_HEAD_COMMENT
|
||||||
|
|
||||||
for result_dict_tmp in [result_dict, sm_75_result_dict]:
|
for (a_type, b_type, c_type), config_list in result_dict.items():
|
||||||
for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
|
all_template_str_list = []
|
||||||
all_template_str_list = []
|
for config in config_list:
|
||||||
if not config_list:
|
s_type = config["s_type"]
|
||||||
continue
|
template_str = jinja2.Template(TEMPLATE).render(
|
||||||
for config in config_list:
|
a_type_id=f"vllm::{a_type}.id()",
|
||||||
s_type = config["s_type"]
|
b_type_id=f"vllm::{b_type}.id()",
|
||||||
template_str = jinja2.Template(TEMPLATE).render(
|
c_type_id=f"vllm::{c_type}.id()",
|
||||||
|
s_type_id=f"vllm::{s_type}.id()",
|
||||||
|
**config,
|
||||||
|
)
|
||||||
|
all_template_str_list.append(template_str)
|
||||||
|
|
||||||
|
conditions = [
|
||||||
|
f"a_type == vllm::{a_type}",
|
||||||
|
f"b_type == vllm::{b_type}",
|
||||||
|
f"c_type == vllm::{c_type}",
|
||||||
|
f"s_type == vllm::{s_type}",
|
||||||
|
f"threads == {config['threads']}",
|
||||||
|
f"thread_m_blocks == {config['thread_m_blocks']}",
|
||||||
|
f"thread_n_blocks == {config['thread_n_blocks']}",
|
||||||
|
f"thread_k_blocks == {config['thread_k_blocks']}",
|
||||||
|
f"m_block_size_8 == {config['m_block_size_8']}",
|
||||||
|
f"group_blocks == {config['group_blocks']}",
|
||||||
|
f"is_zp_float == {config['is_zp_float']}",
|
||||||
|
]
|
||||||
|
conditions = " && ".join(conditions)
|
||||||
|
|
||||||
|
if kernel_selector_str == FILE_HEAD_COMMENT:
|
||||||
|
kernel_selector_str += f"if ({conditions})\n kernel = "
|
||||||
|
else:
|
||||||
|
kernel_selector_str += f"else if ({conditions})\n kernel = "
|
||||||
|
|
||||||
|
kernel_template2 = (
|
||||||
|
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
|
||||||
|
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
|
||||||
|
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
|
||||||
|
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
|
||||||
|
"{{is_zp_float}}>;"
|
||||||
|
)
|
||||||
|
|
||||||
|
kernel_selector_str += (
|
||||||
|
jinja2.Template(kernel_template2).render(
|
||||||
a_type_id=f"vllm::{a_type}.id()",
|
a_type_id=f"vllm::{a_type}.id()",
|
||||||
b_type_id=f"vllm::{b_type}.id()",
|
b_type_id=f"vllm::{b_type}.id()",
|
||||||
c_type_id=f"vllm::{c_type}.id()",
|
c_type_id=f"vllm::{c_type}.id()",
|
||||||
s_type_id=f"vllm::{s_type}.id()",
|
s_type_id=f"vllm::{s_type}.id()",
|
||||||
**config,
|
**config,
|
||||||
)
|
)
|
||||||
all_template_str_list.append(template_str)
|
+ "\n"
|
||||||
|
)
|
||||||
|
|
||||||
conditions = [
|
file_content = FILE_HEAD + "\n\n"
|
||||||
f"a_type == vllm::{a_type}",
|
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
|
||||||
f"b_type == vllm::{b_type}",
|
if a_type == "kFE4M3fn":
|
||||||
f"c_type == vllm::{c_type}",
|
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||||
f"s_type == vllm::{s_type}",
|
else:
|
||||||
f"threads == {config['threads']}",
|
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
||||||
f"thread_m_blocks == {config['thread_m_blocks']}",
|
|
||||||
f"thread_n_blocks == {config['thread_n_blocks']}",
|
|
||||||
f"thread_k_blocks == {config['thread_k_blocks']}",
|
|
||||||
f"m_block_size_8 == {config['m_block_size_8']}",
|
|
||||||
f"stages == {config['stages']}",
|
|
||||||
f"group_blocks == {config['group_blocks']}",
|
|
||||||
f"is_zp_float == {config['is_zp_float']}",
|
|
||||||
]
|
|
||||||
conditions = " && ".join(conditions)
|
|
||||||
|
|
||||||
if kernel_selector_str == FILE_HEAD_COMMENT:
|
filename = filename.lower()
|
||||||
kernel_selector_str += f"if ({conditions})\n kernel = "
|
|
||||||
else:
|
|
||||||
kernel_selector_str += f"else if ({conditions})\n kernel = "
|
|
||||||
|
|
||||||
kernel_template2 = (
|
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
|
||||||
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
|
f.write(file_content)
|
||||||
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
|
|
||||||
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
|
|
||||||
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
|
|
||||||
"{{is_zp_float}}>;"
|
|
||||||
)
|
|
||||||
|
|
||||||
kernel_selector_str += (
|
|
||||||
jinja2.Template(kernel_template2).render(
|
|
||||||
a_type_id=f"vllm::{a_type}.id()",
|
|
||||||
b_type_id=f"vllm::{b_type}.id()",
|
|
||||||
c_type_id=f"vllm::{c_type}.id()",
|
|
||||||
s_type_id=f"vllm::{s_type}.id()",
|
|
||||||
**config,
|
|
||||||
)
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
file_content = FILE_HEAD + "\n\n"
|
|
||||||
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
|
|
||||||
if a_type == "kFE4M3fn":
|
|
||||||
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
|
||||||
elif result_dict_tmp is sm_75_result_dict:
|
|
||||||
filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
|
||||||
else:
|
|
||||||
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
|
|
||||||
|
|
||||||
filename = filename.lower()
|
|
||||||
|
|
||||||
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
|
|
||||||
f.write(file_content)
|
|
||||||
|
|
||||||
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
|
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
|
||||||
kernel_selector_str += (
|
kernel_selector_str += (
|
||||||
|
|||||||
@@ -7,20 +7,20 @@
|
|||||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||||
#include "core/scalar_type.hpp"
|
#include "core/scalar_type.hpp"
|
||||||
|
|
||||||
#define MARLIN_KERNEL_PARAMS \
|
#define MARLIN_KERNEL_PARAMS \
|
||||||
const int4 *__restrict__ A, const int4 *__restrict__ B, \
|
const int4 *__restrict__ A, const int4 *__restrict__ B, \
|
||||||
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
|
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
|
||||||
const int4 *__restrict__ b_bias_ptr, \
|
const int4 *__restrict__ b_bias_ptr, \
|
||||||
const float *__restrict__ a_scales_ptr, \
|
const float *__restrict__ a_scales_ptr, \
|
||||||
const int4 *__restrict__ scales_ptr, \
|
const int4 *__restrict__ scales_ptr, \
|
||||||
const uint16_t *__restrict__ global_scale_ptr, \
|
const uint16_t *__restrict__ global_scale_ptr, \
|
||||||
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
|
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
|
||||||
const int32_t *__restrict__ sorted_token_ids_ptr, \
|
const int32_t *__restrict__ sorted_token_ids_ptr, \
|
||||||
const int32_t *__restrict__ expert_ids_ptr, \
|
const int32_t *__restrict__ expert_ids_ptr, \
|
||||||
const int32_t *__restrict__ num_tokens_past_padded_ptr, \
|
const int32_t *__restrict__ num_tokens_past_padded_ptr, \
|
||||||
const float *__restrict__ topk_weights_ptr, int top_k, \
|
const float *__restrict__ topk_weights_ptr, int top_k, \
|
||||||
bool mul_topk_weights, int num_groups, int prob_m, int prob_n, \
|
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
|
||||||
int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
|
int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
|
||||||
bool use_fp32_reduce
|
bool use_fp32_reduce
|
||||||
|
|
||||||
namespace MARLIN_NAMESPACE_NAME {
|
namespace MARLIN_NAMESPACE_NAME {
|
||||||
|
|||||||
@@ -26,7 +26,6 @@
|
|||||||
#include "quantization/gptq_marlin/marlin.cuh"
|
#include "quantization/gptq_marlin/marlin.cuh"
|
||||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||||
#include "quantization/gptq_marlin/dequant.h"
|
#include "quantization/gptq_marlin/dequant.h"
|
||||||
#include "quantization/gptq_marlin/marlin_mma.h"
|
|
||||||
#include "core/scalar_type.hpp"
|
#include "core/scalar_type.hpp"
|
||||||
|
|
||||||
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
||||||
@@ -36,7 +35,7 @@
|
|||||||
|
|
||||||
namespace MARLIN_NAMESPACE_NAME {
|
namespace MARLIN_NAMESPACE_NAME {
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||||
|
|
||||||
template <typename scalar_t, // compute dtype, half or nv_float16
|
template <typename scalar_t, // compute dtype, half or nv_float16
|
||||||
const vllm::ScalarTypeId b_type_id, // weight MarlinScalarType id
|
const vllm::ScalarTypeId b_type_id, // weight MarlinScalarType id
|
||||||
@@ -71,6 +70,7 @@ __global__ void Marlin(
|
|||||||
const float* __restrict__ topk_weights_ptr, // moe top weights
|
const float* __restrict__ topk_weights_ptr, // moe top weights
|
||||||
int top_k, // num of experts per token
|
int top_k, // num of experts per token
|
||||||
bool mul_topk_weights, // mul topk weights or not
|
bool mul_topk_weights, // mul topk weights or not
|
||||||
|
bool is_ep, // expert parallelism
|
||||||
int num_groups, // number of scale groups per output channel
|
int num_groups, // number of scale groups per output channel
|
||||||
int prob_m, // batch dimension m
|
int prob_m, // batch dimension m
|
||||||
int prob_n, // output dimension n
|
int prob_n, // output dimension n
|
||||||
@@ -84,6 +84,146 @@ __global__ void Marlin(
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
|
||||||
|
// output/accumulation.
|
||||||
|
template <vllm::ScalarTypeId type_id, int k_size = 16>
|
||||||
|
__device__ inline void mma(
|
||||||
|
const typename MarlinScalarType<type_id>::FragA& a_frag,
|
||||||
|
const typename MarlinScalarType<type_id>::FragB& frag_b,
|
||||||
|
typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
|
||||||
|
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
|
||||||
|
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
|
||||||
|
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
|
||||||
|
if constexpr (k_size == 16) {
|
||||||
|
if constexpr (std::is_same<scalar_t, half>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
|
||||||
|
"f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||||
|
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||||
|
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||||
|
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
|
||||||
|
"r"(c[1]), "r"(c[2]), "r"(c[3]));
|
||||||
|
}
|
||||||
|
} else if (k_size == 32) {
|
||||||
|
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||||
|
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||||
|
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
|
||||||
|
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <vllm::ScalarTypeId type_id, int k_size = 16>
|
||||||
|
__device__ inline void mma_trans(
|
||||||
|
const typename MarlinScalarType<type_id>::FragA& a_frag,
|
||||||
|
const typename MarlinScalarType<type_id>::FragB& frag_b,
|
||||||
|
const typename MarlinScalarType<type_id>::FragB& frag_b2,
|
||||||
|
typename MarlinScalarType<type_id>::FragC& frag_c) {
|
||||||
|
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
|
||||||
|
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
|
||||||
|
const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
|
||||||
|
if constexpr (k_size == 16) {
|
||||||
|
if constexpr (std::is_same<scalar_t, half>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
|
||||||
|
"f"(c[3]));
|
||||||
|
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||||
|
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||||
|
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
|
||||||
|
"r"(c[3]));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
|
||||||
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1200
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.kind::f8f6f4.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
#else
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||||
|
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
|
||||||
|
#endif
|
||||||
|
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
|
||||||
|
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
|
||||||
|
asm volatile(
|
||||||
|
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
|
||||||
|
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||||
|
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
|
||||||
|
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
|
||||||
|
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
|
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
|
||||||
// memory, directly in tensor core layout.
|
// memory, directly in tensor core layout.
|
||||||
template <int count, vllm::ScalarTypeId type_id>
|
template <int count, vllm::ScalarTypeId type_id>
|
||||||
@@ -272,6 +412,7 @@ __global__ void Marlin(
|
|||||||
const float* __restrict__ topk_weights_ptr, // moe top weights
|
const float* __restrict__ topk_weights_ptr, // moe top weights
|
||||||
int top_k, // num of experts per token
|
int top_k, // num of experts per token
|
||||||
bool mul_topk_weights, // mul topk weights or not
|
bool mul_topk_weights, // mul topk weights or not
|
||||||
|
bool is_ep, // expert parallelism
|
||||||
int num_groups, // number of scale groups per output channel
|
int num_groups, // number of scale groups per output channel
|
||||||
int prob_m, // batch dimension m
|
int prob_m, // batch dimension m
|
||||||
int prob_n, // output dimension n
|
int prob_n, // output dimension n
|
||||||
@@ -298,20 +439,9 @@ __global__ void Marlin(
|
|||||||
if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
|
if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
|
|
||||||
// Turing TensorCore only supports fp16 and int8
|
|
||||||
if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
|
|
||||||
return;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
|
int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
|
||||||
constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
|
constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
|
|
||||||
constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
|
|
||||||
#else
|
|
||||||
constexpr bool use_fp16_accum = false;
|
|
||||||
#endif
|
|
||||||
using Adtype = MarlinScalarType<a_type_id>;
|
using Adtype = MarlinScalarType<a_type_id>;
|
||||||
using Cdtype = MarlinScalarType<c_type_id>;
|
using Cdtype = MarlinScalarType<c_type_id>;
|
||||||
|
|
||||||
@@ -374,6 +504,14 @@ __global__ void Marlin(
|
|||||||
|
|
||||||
// parallel: num valid moe blocks
|
// parallel: num valid moe blocks
|
||||||
int parallel = num_tokens_past_padded / moe_block_size;
|
int parallel = num_tokens_past_padded / moe_block_size;
|
||||||
|
int num_valid_blocks = parallel;
|
||||||
|
if (is_ep) {
|
||||||
|
for (int i = 0; i < parallel; i++) {
|
||||||
|
if (expert_ids_ptr[i] == -1) num_valid_blocks--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int num_invalid_blocks = parallel - num_valid_blocks;
|
||||||
|
parallel = num_valid_blocks;
|
||||||
|
|
||||||
int k_tiles = prob_k / 16 / thread_k_blocks;
|
int k_tiles = prob_k / 16 / thread_k_blocks;
|
||||||
int n_tiles = prob_n / 16 / thread_n_blocks;
|
int n_tiles = prob_n / 16 / thread_n_blocks;
|
||||||
@@ -480,22 +618,7 @@ __global__ void Marlin(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
|
|
||||||
|
|
||||||
if constexpr (moe_block_size >= 16)
|
|
||||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 16);
|
|
||||||
if constexpr (moe_block_size >= 8)
|
|
||||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 8);
|
|
||||||
if constexpr (moe_block_size >= 4)
|
|
||||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 4);
|
|
||||||
if constexpr (moe_block_size >= 2)
|
|
||||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 2);
|
|
||||||
|
|
||||||
local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 1);
|
|
||||||
block_num_valid_tokens = local_count;
|
|
||||||
#else
|
|
||||||
block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count);
|
block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count);
|
||||||
#endif
|
|
||||||
|
|
||||||
if (lane_id == 0)
|
if (lane_id == 0)
|
||||||
reinterpret_cast<int*>(sh_new)[0] = block_num_valid_tokens;
|
reinterpret_cast<int*>(sh_new)[0] = block_num_valid_tokens;
|
||||||
@@ -528,8 +651,22 @@ __global__ void Marlin(
|
|||||||
if (par_id >= parallel) return;
|
if (par_id >= parallel) return;
|
||||||
|
|
||||||
old_expert_id = expert_id;
|
old_expert_id = expert_id;
|
||||||
block_id = par_id;
|
if (num_invalid_blocks > 0) {
|
||||||
expert_id = expert_ids_ptr[block_id];
|
int skip_count = par_id;
|
||||||
|
for (int i = 0; i < num_tokens_past_padded / moe_block_size; i++) {
|
||||||
|
expert_id = expert_ids_ptr[i];
|
||||||
|
if (expert_id != -1) {
|
||||||
|
if (skip_count == 0) {
|
||||||
|
block_id = i;
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
skip_count--;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
block_id = par_id;
|
||||||
|
expert_id = expert_ids_ptr[block_id];
|
||||||
|
}
|
||||||
|
|
||||||
if constexpr (b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
|
if constexpr (b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
|
||||||
uint16_t val = global_scale_ptr[expert_id];
|
uint16_t val = global_scale_ptr[expert_id];
|
||||||
@@ -881,6 +1018,10 @@ __global__ void Marlin(
|
|||||||
constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
|
constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
|
||||||
: (stages * s_sh_stage);
|
: (stages * s_sh_stage);
|
||||||
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
||||||
|
// shared memory reused by reduction should be smaller than
|
||||||
|
// shared memory used by weight.
|
||||||
|
static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
|
||||||
|
stages * b_sh_stage);
|
||||||
int4* sh_a = sh_s + sh_s_size;
|
int4* sh_a = sh_s + sh_s_size;
|
||||||
|
|
||||||
// Register storage for double buffer of shared memory reads.
|
// Register storage for double buffer of shared memory reads.
|
||||||
@@ -1404,13 +1545,11 @@ __global__ void Marlin(
|
|||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < thread_m_blocks; i++) {
|
for (int i = 0; i < thread_m_blocks; i++) {
|
||||||
if constexpr (m_block_size_8) {
|
if constexpr (m_block_size_8) {
|
||||||
mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
|
mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
|
||||||
frag_c[i][j][0]);
|
frag_c[i][j][0]);
|
||||||
} else {
|
} else {
|
||||||
mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
|
mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
|
||||||
frag_c[i][j][0]);
|
mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
|
||||||
mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
|
|
||||||
frag_c[i][j][1]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1444,12 +1583,10 @@ __global__ void Marlin(
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < thread_m_blocks; i++) {
|
for (int i = 0; i < thread_m_blocks; i++) {
|
||||||
mma<a_type_id, false, 32>(
|
mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
|
||||||
frag_a[k2][i], frag_b[0],
|
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
|
||||||
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
|
mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
|
||||||
mma<a_type_id, false, 32>(
|
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
|
||||||
frag_a[k2][i], frag_b[1],
|
|
||||||
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (group_blocks != -1) {
|
if constexpr (group_blocks != -1) {
|
||||||
@@ -1995,21 +2132,6 @@ __global__ void Marlin(
|
|||||||
// While this pattern may not be the most readable, other ways of writing
|
// While this pattern may not be the most readable, other ways of writing
|
||||||
// the loop seemed to noticeably worse performance after compilation.
|
// the loop seemed to noticeably worse performance after compilation.
|
||||||
if (slice_iters == 0) {
|
if (slice_iters == 0) {
|
||||||
// convert fp16 accum to fp32 for reduction
|
|
||||||
if constexpr (use_fp16_accum) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
|
|
||||||
float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
|
|
||||||
scalar_t* frag_c_part_half =
|
|
||||||
reinterpret_cast<scalar_t*>(frag_c_part_float);
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 3; i >= 0; i--) {
|
|
||||||
frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if constexpr (is_a_8bit) {
|
if constexpr (is_a_8bit) {
|
||||||
float frag_a_s[2 * thread_m_blocks];
|
float frag_a_s[2 * thread_m_blocks];
|
||||||
|
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ typedef struct {
|
|||||||
|
|
||||||
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
||||||
int prob_n, int prob_k, int num_bits, int group_size,
|
int prob_n, int prob_k, int num_bits, int group_size,
|
||||||
bool has_act_order, bool is_k_full, int stages) {
|
bool has_act_order, bool is_k_full) {
|
||||||
bool cache_scales_chunk = has_act_order && !is_k_full;
|
bool cache_scales_chunk = has_act_order && !is_k_full;
|
||||||
|
|
||||||
int tb_n = th_config.thread_n;
|
int tb_n = th_config.thread_n;
|
||||||
@@ -160,13 +160,13 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
|
|||||||
|
|
||||||
if (cache_scales_chunk) {
|
if (cache_scales_chunk) {
|
||||||
int load_groups =
|
int load_groups =
|
||||||
tb_groups * stages * 2; // Chunk size is 2x pipeline over dim K
|
tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
|
||||||
load_groups = max(load_groups, 32); // We load at least 32 scale groups
|
load_groups = max(load_groups, 32); // We load at least 32 scale groups
|
||||||
return load_groups * tb_n * 2;
|
return load_groups * tb_n * 2;
|
||||||
} else {
|
} else {
|
||||||
int tb_scales = tb_groups * tb_n * 2;
|
int tb_scales = tb_groups * tb_n * 2;
|
||||||
|
|
||||||
return tb_scales * stages;
|
return tb_scales * pipe_stages;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
|
|||||||
int thread_m_blocks, int prob_m, int prob_n,
|
int thread_m_blocks, int prob_m, int prob_n,
|
||||||
int prob_k, int num_bits, int group_size,
|
int prob_k, int num_bits, int group_size,
|
||||||
bool has_act_order, bool is_k_full, int has_zp,
|
bool has_act_order, bool is_k_full, int has_zp,
|
||||||
int is_zp_float, bool is_a_8bit, int stages) {
|
int is_zp_float, bool is_a_8bit) {
|
||||||
int pack_factor = 32 / num_bits;
|
int pack_factor = 32 / num_bits;
|
||||||
|
|
||||||
// Get B size
|
// Get B size
|
||||||
@@ -185,8 +185,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
|
|||||||
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
|
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
|
||||||
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
|
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
|
||||||
int sh_block_meta_size = tb_m * 16;
|
int sh_block_meta_size = tb_m * 16;
|
||||||
int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
|
int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
|
||||||
int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
|
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
|
||||||
int sh_red_size = tb_m * (tb_n + 8) * 2;
|
int sh_red_size = tb_m * (tb_n + 8) * 2;
|
||||||
int sh_bias_size = tb_n * 2;
|
int sh_bias_size = tb_n * 2;
|
||||||
int tmp_size =
|
int tmp_size =
|
||||||
@@ -195,8 +195,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
|
|||||||
|
|
||||||
int sh_s_size =
|
int sh_s_size =
|
||||||
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
|
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
|
||||||
group_size, has_act_order, is_k_full, stages);
|
group_size, has_act_order, is_k_full);
|
||||||
int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
|
int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
|
||||||
int sh_zp_size = 0;
|
int sh_zp_size = 0;
|
||||||
if (has_zp) {
|
if (has_zp) {
|
||||||
if (is_zp_float)
|
if (is_zp_float)
|
||||||
@@ -217,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
|
|||||||
int thread_m_blocks, int prob_m, int prob_n, int prob_k,
|
int thread_m_blocks, int prob_m, int prob_n, int prob_k,
|
||||||
int num_bits, int group_size, bool has_act_order,
|
int num_bits, int group_size, bool has_act_order,
|
||||||
bool is_k_full, int has_zp, int is_zp_float,
|
bool is_k_full, int has_zp, int is_zp_float,
|
||||||
bool is_a_8bit, int stages, int max_shared_mem) {
|
int max_shared_mem, bool is_a_8bit) {
|
||||||
// Sanity
|
// Sanity
|
||||||
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
|
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
|
||||||
th_config.num_threads == -1) {
|
th_config.num_threads == -1) {
|
||||||
@@ -243,7 +243,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
|
|||||||
int cache_size =
|
int cache_size =
|
||||||
get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
|
get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
|
||||||
prob_n, prob_k, num_bits, group_size, has_act_order,
|
prob_n, prob_k, num_bits, group_size, has_act_order,
|
||||||
is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
|
is_k_full, has_zp, is_zp_float, is_a_8bit);
|
||||||
return cache_size <= max_shared_mem;
|
return cache_size <= max_shared_mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -252,7 +252,7 @@ MarlinFuncPtr get_marlin_kernel(
|
|||||||
const vllm::ScalarType c_type, const vllm::ScalarType s_type,
|
const vllm::ScalarType c_type, const vllm::ScalarType s_type,
|
||||||
int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
|
int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
|
||||||
bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
|
bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
|
||||||
int threads, bool is_zp_float, int stages) {
|
int threads, bool is_zp_float) {
|
||||||
int num_bits = b_type.size_bits();
|
int num_bits = b_type.size_bits();
|
||||||
auto kernel = MarlinDefault;
|
auto kernel = MarlinDefault;
|
||||||
|
|
||||||
@@ -266,8 +266,8 @@ exec_config_t determine_exec_config(
|
|||||||
const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
|
const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
|
||||||
int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
|
int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
|
||||||
bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
|
bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
|
||||||
bool is_k_full, bool has_zp, bool is_zp_float, bool is_a_8bit, int stages,
|
bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
|
||||||
int max_shared_mem, int sms) {
|
bool is_a_8bit) {
|
||||||
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
|
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
|
||||||
thread_config_t* thread_configs = thread_m_blocks > 1
|
thread_config_t* thread_configs = thread_m_blocks > 1
|
||||||
? large_batch_thread_configs
|
? large_batch_thread_configs
|
||||||
@@ -284,15 +284,15 @@ exec_config_t determine_exec_config(
|
|||||||
|
|
||||||
if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
|
if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
|
||||||
prob_n, prob_k, num_bits, group_size, has_act_order,
|
prob_n, prob_k, num_bits, group_size, has_act_order,
|
||||||
is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
|
is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
|
||||||
max_shared_mem - 512)) {
|
is_a_8bit)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int cache_size = get_kernel_cache_size(
|
int cache_size = get_kernel_cache_size(
|
||||||
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
|
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
|
||||||
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
|
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
|
||||||
is_a_8bit, stages);
|
is_a_8bit);
|
||||||
|
|
||||||
int group_blocks = 0;
|
int group_blocks = 0;
|
||||||
if (!has_act_order) {
|
if (!has_act_order) {
|
||||||
@@ -303,7 +303,7 @@ exec_config_t determine_exec_config(
|
|||||||
get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
|
get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
|
||||||
th_config.thread_n / 16, th_config.thread_k / 16,
|
th_config.thread_n / 16, th_config.thread_k / 16,
|
||||||
m_block_size_8, has_act_order, has_zp, group_blocks,
|
m_block_size_8, has_act_order, has_zp, group_blocks,
|
||||||
th_config.num_threads, is_zp_float, stages);
|
th_config.num_threads, is_zp_float);
|
||||||
|
|
||||||
if (kernel == MarlinDefault) continue;
|
if (kernel == MarlinDefault) continue;
|
||||||
|
|
||||||
@@ -336,14 +336,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
|||||||
void* perm, void* a_tmp, void* sorted_token_ids,
|
void* perm, void* a_tmp, void* sorted_token_ids,
|
||||||
void* expert_ids, void* num_tokens_past_padded,
|
void* expert_ids, void* num_tokens_past_padded,
|
||||||
void* topk_weights, int moe_block_size, int num_experts,
|
void* topk_weights, int moe_block_size, int num_experts,
|
||||||
int top_k, bool mul_topk_weights, int prob_m, int prob_n,
|
int top_k, bool mul_topk_weights, bool is_ep, int prob_m,
|
||||||
int prob_k, void* workspace, vllm::ScalarType const& a_type,
|
int prob_n, int prob_k, void* workspace,
|
||||||
vllm::ScalarType const& b_type, vllm::ScalarType const& c_type,
|
vllm::ScalarType const& a_type, vllm::ScalarType const& b_type,
|
||||||
vllm::ScalarType const& s_type, bool has_bias,
|
vllm::ScalarType const& c_type, vllm::ScalarType const& s_type,
|
||||||
bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
|
bool has_bias, bool has_act_order, bool is_k_full, bool has_zp,
|
||||||
int group_size, int dev, cudaStream_t stream, int thread_k,
|
int num_groups, int group_size, int dev, cudaStream_t stream,
|
||||||
int thread_n, int sms, int blocks_per_sm, bool use_atomic_add,
|
int thread_k, int thread_n, int sms, int blocks_per_sm,
|
||||||
bool use_fp32_reduce, bool is_zp_float) {
|
bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
|
||||||
int thread_m_blocks = div_ceil(moe_block_size, 16);
|
int thread_m_blocks = div_ceil(moe_block_size, 16);
|
||||||
bool m_block_size_8 = moe_block_size == 8;
|
bool m_block_size_8 = moe_block_size == 8;
|
||||||
bool is_a_8bit = a_type.size_bits() == 8;
|
bool is_a_8bit = a_type.size_bits() == 8;
|
||||||
@@ -433,14 +433,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
|||||||
dev);
|
dev);
|
||||||
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
|
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
|
||||||
dev);
|
dev);
|
||||||
TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
|
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
|
||||||
"marlin kernel only support Turing or newer GPUs.");
|
"marlin kernel only support Ampere or newer GPUs.");
|
||||||
int stages = 4;
|
|
||||||
if (major_capability == 7 && minor_capability == 5) {
|
|
||||||
stages = 2;
|
|
||||||
TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
|
|
||||||
"Turing only support FP16 or INT8 activation.");
|
|
||||||
}
|
|
||||||
if (a_type == vllm::kFE4M3fn) {
|
if (a_type == vllm::kFE4M3fn) {
|
||||||
TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
|
TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
|
||||||
"FP8 only support Ada Lovelace or newer GPUs.");
|
"FP8 only support Ada Lovelace or newer GPUs.");
|
||||||
@@ -467,8 +461,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
|||||||
exec_cfg = determine_exec_config(
|
exec_cfg = determine_exec_config(
|
||||||
a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
|
a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
|
||||||
top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
|
top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
|
||||||
has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
|
has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
|
||||||
max_shared_mem, sms);
|
is_a_8bit);
|
||||||
thread_tfg = exec_cfg.tb_cfg;
|
thread_tfg = exec_cfg.tb_cfg;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -485,7 +479,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
|||||||
TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
|
TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
|
||||||
prob_m, prob_n, prob_k, num_bits, group_size,
|
prob_m, prob_n, prob_k, num_bits, group_size,
|
||||||
has_act_order, is_k_full, has_zp, is_zp_float,
|
has_act_order, is_k_full, has_zp, is_zp_float,
|
||||||
is_a_8bit, stages, max_shared_mem),
|
max_shared_mem, is_a_8bit),
|
||||||
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
|
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
|
||||||
", thread_k = ", thread_tfg.thread_k,
|
", thread_k = ", thread_tfg.thread_k,
|
||||||
", thread_n = ", thread_tfg.thread_n,
|
", thread_n = ", thread_tfg.thread_n,
|
||||||
@@ -499,12 +493,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
|||||||
int sh_cache_size =
|
int sh_cache_size =
|
||||||
get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
|
get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
|
||||||
prob_n, prob_k, num_bits, group_size, has_act_order,
|
prob_n, prob_k, num_bits, group_size, has_act_order,
|
||||||
is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
|
is_k_full, has_zp, is_zp_float, is_a_8bit);
|
||||||
|
|
||||||
auto kernel = get_marlin_kernel(
|
auto kernel = get_marlin_kernel(
|
||||||
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
|
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
|
||||||
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
|
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
|
||||||
num_threads, is_zp_float, stages);
|
num_threads, is_zp_float);
|
||||||
|
|
||||||
if (kernel == MarlinDefault) {
|
if (kernel == MarlinDefault) {
|
||||||
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
|
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
|
||||||
@@ -523,7 +517,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
|||||||
kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
|
kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
|
||||||
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
|
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
|
||||||
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
|
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
|
||||||
topk_weights_ptr, top_k, mul_topk_weights, num_groups, prob_m,
|
topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
|
||||||
prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
|
prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
|
||||||
// clang-format on
|
// clang-format on
|
||||||
}
|
}
|
||||||
@@ -541,7 +535,7 @@ torch::Tensor moe_wna16_marlin_gemm(
|
|||||||
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
|
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
|
||||||
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
|
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
|
||||||
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
|
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
|
||||||
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights,
|
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
|
||||||
vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
|
vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
|
||||||
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
|
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
|
||||||
bool is_zp_float, int64_t thread_k, int64_t thread_n,
|
bool is_zp_float, int64_t thread_k, int64_t thread_n,
|
||||||
@@ -855,9 +849,9 @@ torch::Tensor moe_wna16_marlin_gemm(
|
|||||||
perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
|
perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
|
||||||
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
|
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
|
||||||
topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
|
topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
|
||||||
mul_topk_weights, size_m, size_n, size_k, workspace.data_ptr(), a_type,
|
mul_topk_weights, is_ep, size_m, size_n, size_k, workspace.data_ptr(),
|
||||||
b_type, c_type, s_type, has_bias, has_act_order, is_k_full, has_zp,
|
a_type, b_type, c_type, s_type, has_bias, has_act_order, is_k_full,
|
||||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||||
thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
|
thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
|
||||||
is_zp_float);
|
is_zp_float);
|
||||||
|
|
||||||
@@ -866,4 +860,4 @@ torch::Tensor moe_wna16_marlin_gemm(
|
|||||||
|
|
||||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||||
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
|
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
|
||||||
}
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user