diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 68aff793a..76f6d7aec 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@ import os
import sys
import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have an 800 MiB quota; please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 50431d0cd..5ea5a50a2 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -218,7 +218,7 @@ if __name__ == "__main__":
"--xaxis",
type=str,
default="# of max concurrency.",
- help="column name to use as X Axis in comparision graph",
+ help="column name to use as X Axis in comparison graph",
)
args = parser.parse_args()
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index 2d88a0b30..f758097e0 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,6 +1,6 @@
[
{
- "test_name": "serving_llama8B_tp1_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -32,7 +32,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -64,7 +64,7 @@
}
},
{
- "test_name": "serving_llama8B_tp4_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -96,7 +96,7 @@
}
},
{
- "test_name": "serving_llama8B_tp1_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -131,7 +131,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -166,7 +166,7 @@
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -198,5 +198,413 @@
"random-output-len": 128,
"num_prompts": 1000
}
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
}
]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index 823abbaa9..ce396d6e5 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,6 +1,6 @@
[
{
- "test_name": "serving_llama8B_pp1_sharegpt",
+ "test_name": "serving_llama8B_bf16_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -32,7 +32,39 @@
}
},
{
- "test_name": "serving_llama8B_pp3_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -64,7 +96,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2pp3_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -97,7 +129,7 @@
}
},
{
- "test_name": "serving_llama8B_pp1_random_128_128",
+ "test_name": "serving_llama8B_bf16_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -132,7 +164,42 @@
}
},
{
- "test_name": "serving_llama8B_pp3_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -167,7 +234,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2pp3_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -201,5 +268,553 @@
"ignore-eos": "",
"num_prompts": 1000
}
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
}
]
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index f96c38bf5..a1de41652 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,21 +1,24 @@
steps:
- # aarch64 + CUDA builds
- - label: "Build arm64 wheel - CUDA 12.8"
- id: build-wheel-arm64-cuda-12-8
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build arm64 wheel - CUDA 12.9"
+ id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- # x86 + CUDA builds
+ - block: "Build CUDA 12.8 wheel"
+ key: block-build-cu128-wheel
+
- label: "Build wheel - CUDA 12.8"
+ depends_on: block-build-cu128-wheel
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@@ -44,44 +47,63 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
- # However, this block can be uncommented to save some compute hours.
- # - block: "Build CUDA 11.8 wheel"
- # key: block-build-cu118-wheel
-
- - label: "Build wheel - CUDA 11.8"
- # depends_on: block-build-cu118-wheel
- id: build-wheel-cuda-11-8
+ # x86 + CUDA builds
+ - label: "Build wheel - CUDA 12.9"
+ depends_on: ~
+ id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- - block: "Build release image"
+ - label: "Build release image (x86)"
depends_on: ~
- key: block-release-image-build
-
- - label: "Build release image"
- depends_on: block-release-image-build
- id: build-release-image
+ id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+ # re-tag to default image tag and push, just in case arm64 build fails
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build release image (arm64)"
+ depends_on: ~
+ id: build-release-image-arm64
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+ # Add job to create multi-arch manifest
+ - label: "Create multi-arch manifest"
+ depends_on:
+ - build-release-image-x86
+ - build-release-image-arm64
+ id: create-multi-arch-manifest
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
- label: "Annotate release workflow"
depends_on:
- - build-release-image
+ - create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- - build-wheel-cuda-11-8
+ - build-wheel-cuda-12-9
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@@ -128,18 +150,24 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build Neuron release image"
- key: block-neuron-release-image-build
- depends_on: ~
-
- - label: "Build and publish Neuron release image"
- depends_on: block-neuron-release-image-build
+ - label: "Build and publish nightly multi-arch image to DockerHub"
+ depends_on:
+ - create-multi-arch-manifest
+ if: build.env("NIGHTLY") == "1"
agents:
- queue: neuron-postmerge
+ queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "docker push vllm/vllm-openai:nightly"
+ - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ # Clean up old nightly builds (keep only last 14)
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
new file mode 100755
index 000000000..1a82f7d08
--- /dev/null
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -e  # no -x: tracing would print the bearer token passed to every curl call
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses the DockerHub API to list and delete old tags with the "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# The token is sent as a bearer credential on every API request; refuse to run without it
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+    echo "Error: DOCKERHUB_TOKEN environment variable is not set" >&2
+    exit 1
+fi
+
+# Print every "nightly-" tag on DockerHub, newest first, one tag name per line
+get_all_tags() {
+    local page=1
+    local all_tags=""
+
+    while true; do
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+            "$REPO_API_URL?page=$page&page_size=100")
+
+        # Get both last_updated timestamp and tag name, separated by |
+        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+        if [ -n "$tags" ]; then
+            all_tags="$all_tags$tags"$'\n'
+        fi
+
+        # Follow the "next" link instead of stopping at the first page without
+        # nightly tags: other tags share the listing, so such a page may not be last
+        local next_url=$(echo "$response" | jq -r '.next // empty')
+        [ -n "$next_url" ] || break
+        page=$((page + 1))
+    done
+    # Sort by timestamp (newest first) and extract just the tag names
+    echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+# Delete one tag ($1) from vllm/vllm-openai; a failure is reported as a warning only
+delete_tag() {
+    local tag_name="$1"
+    echo "Deleting tag: $tag_name"
+    local reply=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        "https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name")
+    # A response body with a truthy .detail field is treated as a failure
+    if ! jq -e '.detail' <<< "$reply" > /dev/null 2>&1; then
+        echo "Successfully deleted tag: $tag_name"
+        return
+    fi
+    echo "Warning: Failed to delete tag $tag_name: $(jq -r '.detail' <<< "$reply")"
+}
+
+# Collect the nightly- tags; get_all_tags prints them newest first, one per line
+echo "Fetching all tags from DockerHub..."
+nightly_tags=$(get_all_tags)
+
+if [ -z "$nightly_tags" ]; then
+    echo "No tags found to clean up"
+    exit 0
+fi
+
+# One tag per line, so the line count is the tag count
+tag_count=$(wc -l <<< "$nightly_tags")
+echo "Found $tag_count tags"
+
+# Retention window: the 14 most recent builds (including the current one)
+keep_count=14
+excess_count=$((tag_count - keep_count))
+
+if [ "$excess_count" -le 0 ]; then
+    echo "No tags need to be deleted (only $tag_count tags found, keeping $keep_count)"
+    exit 0
+fi
+
+echo "Will delete $excess_count old tags, keeping the newest $keep_count"
+
+# Everything after the first $keep_count lines is stale
+stale_tags=$(tail -n +$((keep_count + 1)) <<< "$nightly_tags")
+
+if [ -z "$stale_tags" ]; then
+    echo "No tags to delete"
+    exit 0
+fi
+
+# Remove the stale tags one by one
+echo "Deleting old tags..."
+while IFS= read -r stale_tag; do
+    # Skip blank lines
+    [ -n "$stale_tag" ] || continue
+    delete_tag "$stale_tag"
+    # Add a small delay to avoid rate limiting
+    sleep 1
+done <<< "$stale_tags"
+
+echo "Cleanup completed successfully"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index df0bae0c9..c395011a2 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
- --ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 9dec9f8e9..0f734763f 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@@ -49,23 +49,23 @@ function cpu_tests() {
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -v -s tests/kernels/test_onednn.py"
+ pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
- # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
- # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# Note: disable Bart until supports V1
- pytest -v -s tests/models/language/generation -m cpu_model \
+ pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
- VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+ VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
- pytest -v -s tests/models/language/pooling -m cpu_model
- pytest -v -s tests/models/multimodal/generation \
+ pytest -x -v -s tests/models/language/pooling -m cpu_model
+ pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
@@ -73,33 +73,49 @@ function cpu_tests() {
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
+ pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
- # VLLM_USE_V1=0 pytest -s -v \
+ # VLLM_USE_V1=0 pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
+ pytest -x -s -v \
tests/lora/test_qwen2vl.py"
- # online serving
+ # online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+ server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
- --endpoint /v1/completions'
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
+
+ # online serving: tp+dp
+ docker exec cpu-test-"$NUMA_NODE" bash -c '
+ set -e
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+ server_pid=$!
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ vllm bench serve \
+ --backend vllm \
+ --dataset-name random \
+ --model meta-llama/Llama-3.2-3B-Instruct \
+ --num-prompts 20 \
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted file mode 100644
index a397457c8..000000000
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
- last_build=$(cat /tmp/neuron-docker-build-timestamp)
- current_time=$(date +%s)
- if [ $((current_time - last_build)) -gt 86400 ]; then
- # Remove dangling images (those that are not tagged and not used by any container)
- docker image prune -f
- # Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune -f
- echo "$current_time" > /tmp/neuron-docker-build-timestamp
- fi
-else
- date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
- docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
- -v "${HF_CACHE}:${HF_MOUNT}" \
- -e "HF_HOME=${HF_MOUNT}" \
- -e "HF_TOKEN=${HF_TOKEN}" \
- -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
- -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
- --name "${container_name}" \
- ${image_name} \
- /bin/bash -c "
- set -e; # Exit on first error
- python3 /workspace/vllm/examples/offline_inference/neuron.py;
- python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
- for f in /workspace/vllm/tests/neuron/2_core/*.py; do
- echo \"Running test file: \$f\";
- python3 -m pytest \$f -v --capture=tee-sys;
- done
- "
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 73f3e63fb..8c9b00990 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -30,10 +30,12 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ pip install tblib==3.1.0
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 745f285c0..43aa8c47b 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -74,14 +75,15 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0d3b7a294..b0f5fe418 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -41,7 +41,8 @@ steps:
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -53,6 +54,7 @@ steps:
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
@@ -62,8 +64,10 @@ steps:
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
+ - pytest -v -s transformers_utils # transformers_utils
-- label: Python-only Installation Test
+- label: Python-only Installation Test # 10min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
@@ -71,7 +75,8 @@ steps:
commands:
- bash standalone_tests/python_only_compile.sh
-- label: Basic Correctness Test # 30min
+- label: Basic Correctness Test # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
fast_check: true
torch_nightly: true
@@ -88,7 +93,8 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-- label: Core Test # 10min
+- label: Core Test # 22min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: true
source_file_dependencies:
@@ -98,7 +104,19 @@ steps:
commands:
- pytest -v -s core
-- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Unit Tests # 5min
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -109,13 +127,13 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Test (API Server) # 40min
+- label: Entrypoints Integration Test (API Server) # 100min
+ timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -127,10 +145,24 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
-- label: Distributed Tests (4 GPUs) # 10min
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -173,7 +205,8 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
-- label: EPLB Algorithm Test
+- label: EPLB Algorithm Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
@@ -182,6 +215,7 @@ steps:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -190,13 +224,14 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_execute.py
-- label: Metrics, Tracing Test # 10min
+- label: Metrics, Tracing Test # 12min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/metrics
- - tests/tracing
+ - tests/v1/tracing
commands:
- pytest -v -s metrics
- "pip install \
@@ -209,7 +244,8 @@ steps:
##### fast check tests #####
##### 1 GPU test #####
-- label: Regression Test # 5min
+- label: Regression Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -219,7 +255,8 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 10min
+- label: Engine Test # 25min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -234,7 +271,29 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
-- label: V1 Test
+- label: V1 Test e2e + engine # 30min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: V1 Test others # 42min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -242,8 +301,6 @@ steps:
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- - pytest -v -s v1/engine
- - pytest -v -s v1/entrypoints
- pytest -v -s v1/executor
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
@@ -256,14 +313,12 @@ steps:
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
- # TODO: accuracy does not match, whether setting
- # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- - pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-- label: Examples Test # 25min
+- label: Examples Test # 30min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
@@ -281,14 +336,14 @@ steps:
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-- label: Platform Tests (CUDA)
+- label: Platform Tests (CUDA) # 4min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -296,7 +351,8 @@ steps:
commands:
- pytest -v -s cuda/test_cuda_context.py
-- label: Samplers Test # 36min
+- label: Samplers Test # 56min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers
@@ -307,15 +363,23 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 20min each
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+ commands:
+ - pytest -v -s lora \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --ignore=lora/test_chatglm3_tp.py \
+ --ignore=lora/test_llama_tp.py \
+ --ignore=lora/test_llm_with_multi_loras.py
parallelism: 4
-- label: PyTorch Compilation Unit Tests
+- label: PyTorch Compilation Unit Tests # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -331,7 +395,8 @@ steps:
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
-- label: PyTorch Fullgraph Smoke Test # 9min
+- label: PyTorch Fullgraph Smoke Test # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -339,13 +404,10 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- # these tests need to be separated, cannot combine
- - pytest -v -s compile/piecewise/test_simple.py
- - pytest -v -s compile/piecewise/test_toy_llama.py
- - pytest -v -s compile/piecewise/test_full_cudagraph.py
- - pytest -v -s compile/piecewise/test_multiple_graphs.py
+ - pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 18min
+- label: PyTorch Fullgraph Test # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -354,7 +416,8 @@ steps:
commands:
- pytest -v -s compile/test_full_graph.py
-- label: Kernels Core Operation Test
+- label: Kernels Core Operation Test # 48min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -362,7 +425,8 @@ steps:
commands:
- pytest -v -s kernels/core
-- label: Kernels Attention Test %N
+- label: Kernels Attention Test %N # 23min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
@@ -373,7 +437,8 @@ steps:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Quantization Test %N
+- label: Kernels Quantization Test %N # 64min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/
@@ -383,7 +448,8 @@ steps:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels MoE Test %N
+- label: Kernels MoE Test %N # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
@@ -395,7 +461,8 @@ steps:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Mamba Test
+- label: Kernels Mamba Test # 31min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
@@ -403,7 +470,8 @@ steps:
commands:
- pytest -v -s kernels/mamba
-- label: Tensorizer Test # 11min
+- label: Tensorizer Test # 14min
+ timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/model_loader
@@ -415,7 +483,8 @@ steps:
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-- label: Model Executor Test
+- label: Model Executor Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
@@ -425,7 +494,8 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
-- label: Benchmarks # 9min
+- label: Benchmarks # 11min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
@@ -433,7 +503,8 @@ steps:
commands:
- bash scripts/run-benchmarks.sh
-- label: Benchmarks CLI Test # 10min
+- label: Benchmarks CLI Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -441,7 +512,8 @@ steps:
commands:
- pytest -v -s benchmarks/
-- label: Quantization Test
+- label: Quantization Test # 70min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -449,11 +521,16 @@ steps:
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
- # after torchao 0.12 release
- - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -461,7 +538,8 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-- label: OpenAI API correctness
+- label: OpenAI API correctness # 22min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -470,7 +548,8 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: Encoder Decoder tests # 5min
+- label: Encoder Decoder tests # 12min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -478,7 +557,8 @@ steps:
commands:
- pytest -v -s encoder_decoder
-- label: OpenAI-Compatible Tool Use # 20 min
+- label: OpenAI-Compatible Tool Use # 23 min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
@@ -491,7 +571,8 @@ steps:
##### models test #####
-- label: Basic Models Test # 24min
+- label: Basic Models Test # 57min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -504,7 +585,8 @@ steps:
- pytest -v -s models/test_vision.py
- pytest -v -s models/test_initialization.py
-- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 35min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -515,6 +597,7 @@ steps:
- pytest -v -s models/language -m core_model
- label: Language Models Test (Hybrid) # 35 min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -527,7 +610,8 @@ steps:
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m hybrid_model
-- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -538,7 +622,18 @@ steps:
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -547,16 +642,27 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
-- label: Multi-Modal Processor Test
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+ timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+ - pytest -v -s models/multimodal/processing
-- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 60min
+ timeout_in_minutes: 80
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -566,7 +672,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -598,7 +704,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-- label: Quantized Models Test
+- label: Quantized Models Test # 45 min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers/quantization
@@ -628,7 +735,8 @@ steps:
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
-- label: Blackwell Test
+- label: Blackwell Test # 38 min
+ timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@@ -650,10 +758,12 @@ steps:
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -663,11 +773,13 @@ steps:
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -679,6 +791,7 @@ steps:
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -702,7 +815,8 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 110min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -733,7 +847,8 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
@@ -743,6 +858,7 @@ steps:
- pytest -v -s models/multimodal/generation/test_maverick.py
- label: Plugin Tests (2 GPUs) # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -755,6 +871,11 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -763,7 +884,8 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
-- label: Pipeline Parallelism Test # 45min
+- label: Pipeline + Context Parallelism Test # 45min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -776,8 +898,10 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
+ # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
-- label: LoRA TP Test (Distributed)
+- label: LoRA TP Test (Distributed) # 17 min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
num_gpus: 4
source_file_dependencies:
@@ -791,13 +915,15 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- - pytest -v -s -x lora/test_multi_loras_with_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
- label: Weight Loading Multiple GPU Test # 33min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
+ optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 000000000..443dfa45a
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+# - "vllm/v1/attention/*.py"
+# - "vllm/v1/core/*.py"
+exclude:
+ - "**/*.py"
+
+scan:
+ functions: true # check free functions and methods
+ classes: true # check classes/dataclasses
+ public_only: true # ignore names starting with "_" at any level
+
+annotations:
+ include: # decorators that force‑include a symbol
+ - name: "bc_linter_include" # matched by simple name or dotted suffix
+ propagate_to_members: false # for classes, include methods/inner classes
+ exclude: # decorators that force‑exclude a symbol
+ - name: "bc_linter_skip" # matched by simple name or dotted suffix
+ propagate_to_members: true # for classes, exclude methods/inner classes
+
+excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c087fd555..846b68054 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,18 +5,21 @@
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
@@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @heheda12345
+/vllm/v1/kv_cache_interface.py @heheda12345
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @heheda12345
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
# Docs
/docs @hmellor
@@ -67,6 +75,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
@@ -86,3 +97,8 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/tests/v1/tpu @NickLucche
\ No newline at end of file
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 495d207d4..f2dd2e062 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -124,9 +124,16 @@ pull_request_rules:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
+ - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+ - files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/tool_server.py
+ - files~=^vllm/entrypoints/tool.py
+ - files~=^vllm/entrypoints/context.py
- title~=(?i)gpt[-_]?oss
+ - title~=(?i)harmony
actions:
label:
add:
@@ -273,6 +280,20 @@ pull_request_rules:
users:
- "sangstar"
+- name: assign reviewer for modelopt changes
+ conditions:
+ - or:
+ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+ - files~=^tests/models/quantization/test_modelopt\.py$
+ - files~=^tests/quantization/test_modelopt\.py$
+ - files~=^tests/models/quantization/test_nvfp4\.py$
+ - files~=^docs/features/quantization/modelopt\.md$
+ actions:
+ assign:
+ users:
+ - "Edwardf0t1"
+
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
diff --git a/.github/scale-config.yml b/.github/scale-config.yml
new file mode 100644
index 000000000..c41a3ee3e
--- /dev/null
+++ b/.github/scale-config.yml
@@ -0,0 +1,21 @@
+# scale-config.yml:
+# Powers what instance types are available for GHA auto-scaled
+# runners. Runners listed here will be available as self hosted
+# runners, configuration is directly pulled from the main branch.
+# runner_types:
+# runner_label:
+# instance_type: m4.large
+# os: linux
+# # min_available defaults to the global cfg in the ALI Terraform
+# min_available: undefined
+# # when max_available value is not defined, no max runners is enforced
+# max_available: undefined
+# disk_size: 50
+# is_ephemeral: true
+
+runner_types:
+ linux.2xlarge:
+ disk_size: 150
+ instance_type: c5.2xlarge
+ is_ephemeral: true
+ os: linux
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 315042fbf..d8bbedef3 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 000000000..3795b046d
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,27 @@
+name: BC Lint
+
+on:
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+
+jobs:
+ bc_lint:
+ if: github.repository_owner == 'vllm-project'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Run BC Lint Action
+ uses: pytorch/test-infra/.github/actions/bc-lint@main
+ with:
+ repo: ${{ github.event.pull_request.head.repo.full_name }}
+ base_sha: ${{ github.event.pull_request.base.sha }}
+ head_sha: ${{ github.event.pull_request.head.sha }}
+ suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+ docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+ config_dir: .github
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+ cancel-in-progress: true
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index d5c6b8d43..c3e132a53 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index 6401d6586..c2b17abe8 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Add new labels and keywords here
@@ -49,6 +49,10 @@ jobs:
term: "VLLM_ROCM_",
searchIn: "both"
},
+ {
+ term: "aiter",
+ searchIn: "title"
+ },
{
term: "rocm",
searchIn: "title"
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 195579f20..e21d13b81 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 1ee605dc7..8884359fa 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
try {
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 656f3d3fa..82844810a 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+ - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 465935d48..b1df673e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
-# triton jit
+# triton jit
.triton
# Byte-compiled / optimized / DLL files
@@ -177,6 +177,14 @@ cython_debug/
# VSCode
.vscode/
+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
# DS Store
.DS_Store
@@ -209,4 +217,4 @@ shellcheck*/
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 612b290e8..c16bdeeec 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.34.0
+ rev: v1.35.5
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf838..381582590 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb645b183..1e58ebb55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
#
# Try to find python package with an executable that exactly matches
@@ -542,6 +542,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -560,6 +561,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
diff --git a/MANIFEST.in b/MANIFEST.in
index 82fd22b84..fb3cccbb4 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
-include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt
diff --git a/README.md b/README.md
index 8812aac4e..b4a3583c2 100644
--- a/README.md
+++ b/README.md
@@ -14,19 +14,24 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
---
*Latest News* 🔥
+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
-- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 38072152b..ee1726420 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -95,6 +95,24 @@ become available.
✅ |
lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered |
+
+ | HuggingFace-MTBench |
+ ✅ |
+ ✅ |
+ philschmid/mt-bench |
+
+
+ | HuggingFace-Blazedit |
+ ✅ |
+ ✅ |
+ vdaita/edit_5k_char, vdaita/edit_10k_char |
+
+
+ | Spec Bench |
+ ✅ |
+ ✅ |
+ wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl |
+
| Custom |
✅ |
@@ -110,7 +128,12 @@ become available.
🚧: to be supported
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For a local `dataset-path`, please also set `hf-name` to the dataset's Hugging Face ID, for example:
+
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```
## 🚀 Example - Online Benchmark
@@ -234,6 +257,43 @@ vllm bench serve \
--num-prompts 2048
```
+### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+ --speculative-config $'{"method": "ngram",
+ "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+ "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+ --model meta-llama/Meta-Llama-3-8B-Instruct \
+ --dataset-name spec_bench \
+ --dataset-path "/data/spec_bench/question.jsonl" \
+ --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+ --model meta-llama/Meta-Llama-3-8B-Instruct \
+ --dataset-name spec_bench \
+ --dataset-path "/data/spec_bench/question.jsonl" \
+ --num-prompts -1 \
+ --spec-bench-category "summarization"
+```
+
### Other HuggingFaceDataset Examples
```bash
@@ -290,6 +350,18 @@ vllm bench serve \
--num-prompts 80
```
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+ --model Qwen/QwQ-32B \
+ --dataset-name hf \
+ --dataset-path vdaita/edit_5k_char \
+ --num-prompts 90 \
+ --blazedit-min-distance 0.01 \
+ --blazedit-max-distance 0.99
+```
+
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -689,7 +761,7 @@ python -m vllm.entrypoints.openai.api_server \
Send requests with images:
```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
@@ -716,7 +788,7 @@ python -m vllm.entrypoints.openai.api_server \
Send requests with videos:
```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 9aad51df6..3aa988aac 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -31,6 +31,12 @@ cd vllm
You must set the following variables at the top of the script before execution.
+ Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 82c20ffa6..ed3679b66 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -5,25 +5,41 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNE PARAMETERS ===================="
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
@@ -213,7 +229,7 @@ run_benchmark() {
pkill -if vllm
sleep 10
- printf '=%.0s' $(seq 1 20)
+ echo "===================="
return 0
}
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index fd363c2ad..eae8d9927 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -57,7 +57,7 @@ def invoke_main() -> None:
"--num-iteration",
type=int,
default=1000,
- help="Number of iterations to run to stablize final data readings",
+ help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--allocate-blocks",
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 2ea4f9cca..64ffa62c0 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
# [6880, 6881] -> ['Ġcalls', 'here'] ->
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length,
- # the encoded sequence is truncated before being decode again.
+ # the encoded sequence is truncated before being decoded again.
total_input_len = prefix_len + int(input_lens[i])
re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
:total_input_len
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d8b960eda..a7892f3f7 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,191 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
-
-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={"latency": results["latencies"]},
- extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
- )
- if pt_records:
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_latency.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
-
- engine_args = EngineArgs.from_cli_args(args)
-
- # NOTE(woosuk): If the request cannot be processed in a single batch,
- # the engine will automatically process the request in multiple batches.
- llm = LLM(**dataclasses.asdict(engine_args))
- assert llm.llm_engine.model_config.max_model_len >= (
- args.input_len + args.output_len
- ), (
- "Please ensure that max_model_len is greater than"
- " the sum of input_len and output_len."
- )
-
- sampling_params = SamplingParams(
- n=args.n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=args.output_len,
- detokenize=not args.disable_detokenize,
- )
- print(sampling_params)
- dummy_prompt_token_ids = np.random.randint(
- 10000, size=(args.batch_size, args.input_len)
- )
- dummy_prompts: list[PromptType] = [
- {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
- ]
-
- def llm_generate():
- if not args.use_beam_search:
- llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
- else:
- llm.beam_search(
- dummy_prompts,
- BeamSearchParams(
- beam_width=args.n,
- max_tokens=args.output_len,
- ignore_eos=True,
- ),
- )
-
- def run_to_completion(profile_dir: Optional[str] = None):
- if profile_dir:
- llm.start_profile()
- llm_generate()
- llm.stop_profile()
- else:
- start_time = time.perf_counter()
- llm_generate()
- end_time = time.perf_counter()
- latency = end_time - start_time
- return latency
-
- print("Warming up...")
- for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
- run_to_completion(profile_dir=None)
-
- if args.profile:
- profile_dir = envs.VLLM_TORCH_PROFILER_DIR
- print(f"Profiling (results will be saved to '{profile_dir}')...")
- run_to_completion(profile_dir=profile_dir)
- return
-
- # Benchmark.
- latencies = []
- for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
- latencies.append(run_to_completion(profile_dir=None))
- latencies = np.array(latencies)
- percentages = [10, 25, 50, 75, 90, 99]
- percentiles = np.percentile(latencies, percentages)
- print(f"Avg latency: {np.mean(latencies)} seconds")
- for percentage, percentile in zip(percentages, percentiles):
- print(f"{percentage}% percentile latency: {percentile} seconds")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "avg_latency": np.mean(latencies),
- "latencies": latencies.tolist(),
- "percentiles": dict(zip(percentages, percentiles.tolist())),
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the latency of processing a single batch of "
- "requests till completion."
- )
- parser.add_argument("--input-len", type=int, default=32)
- parser.add_argument("--output-len", type=int, default=128)
- parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument(
- "--n",
- type=int,
- default=1,
- help="Number of generated sequences per prompt.",
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-iters-warmup",
- type=int,
- default=10,
- help="Number of iterations to run for warmup.",
- )
- parser.add_argument(
- "--num-iters", type=int, default=30, help="Number of iterations to run."
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="profile the generation process of a single batch",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the latency results in JSON format.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize responses (i.e. do not include "
- "detokenization time in the latency measurement)"
- ),
- )
-
- parser = EngineArgs.add_cli_args(parser)
- # V1 enables prefix caching by default which skews the latency
- # numbers. We need to disable prefix caching by default.
- parser.set_defaults(enable_prefix_caching=False)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
- raise OSError(
- "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
- "Please set it to a valid path to use torch profiler."
- )
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench latency
+
+For help with the new command, run:
+ vllm bench latency --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench latency --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index c60040d05..11833fa1b 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -77,7 +77,7 @@ def invoke_main() -> None:
"--num-iteration",
type=int,
default=100,
- help="Number of iterations to run to stablize final data readings",
+ help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--num-req", type=int, default=128, help="Number of requests in the batch"
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 02f5f585c..76cf51498 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,1324 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-r"""Benchmark online serving throughput.
-
-On the server side, run one of the following commands:
- vLLM OpenAI API server
- vllm serve \
- --swap-space 16
-
-On the client side, run:
- python benchmarks/benchmark_serving.py \
- --backend \
- --model \
- --dataset-name sharegpt \
- --dataset-path \
- --request-rate \ # By default is inf
- --num-prompts # By default is 1000
-
- when using tgi backend, add
- --endpoint /generate_stream
- to the end of the command above.
-"""
-
-import argparse
-import asyncio
-import gc
-import json
-import os
-import random
-import time
-import warnings
-from collections.abc import Iterable
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any, Literal, Optional
-
-import numpy as np
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from backend_request_func import (
- ASYNC_REQUEST_FUNCS,
- OPENAI_COMPATIBLE_BACKENDS,
- RequestFuncInput,
- RequestFuncOutput,
-)
-
-try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
- from backend_request_func import get_tokenizer
-
-try:
- from vllm.utils import FlexibleArgumentParser
-except ImportError:
- from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (
- AIMODataset,
- ASRDataset,
- BurstGPTDataset,
- ConversationDataset,
- CustomDataset,
- HuggingFaceDataset,
- InstructCoderDataset,
- MTBenchDataset,
- NextEditPredictionDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.benchmarks.serve import get_request
-
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-
-
-@dataclass
-class BenchmarkMetrics:
- completed: int
- total_input: int
- total_output: int
- request_throughput: float
- request_goodput: float
- output_throughput: float
- total_token_throughput: float
- mean_ttft_ms: float
- median_ttft_ms: float
- std_ttft_ms: float
- percentiles_ttft_ms: list[tuple[float, float]]
- mean_tpot_ms: float
- median_tpot_ms: float
- std_tpot_ms: float
- percentiles_tpot_ms: list[tuple[float, float]]
- mean_itl_ms: float
- median_itl_ms: float
- std_itl_ms: float
- percentiles_itl_ms: list[tuple[float, float]]
- # E2EL stands for end-to-end latency per request.
- # It is the time taken on the client side from sending
- # a request to receiving a complete response.
- mean_e2el_ms: float
- median_e2el_ms: float
- std_e2el_ms: float
- percentiles_e2el_ms: list[tuple[float, float]]
-
-
-def calculate_metrics(
- input_requests: list[SampleRequest],
- outputs: list[RequestFuncOutput],
- dur_s: float,
- tokenizer: PreTrainedTokenizerBase,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
- actual_output_lens: list[int] = []
- total_input = 0
- completed = 0
- good_completed = 0
- itls: list[float] = []
- tpots: list[float] = []
- all_tpots: list[float] = []
- ttfts: list[float] = []
- e2els: list[float] = []
- for i in range(len(outputs)):
- if outputs[i].success:
- output_len = outputs[i].output_tokens
-
- if not output_len:
- # We use the tokenizer to count the number of output tokens
- # for some serving backends instead of looking at
- # len(outputs[i].itl) since multiple output tokens may be
- # bundled together
- # Note : this may inflate the output token count slightly
- output_len = len(
- tokenizer(
- outputs[i].generated_text, add_special_tokens=False
- ).input_ids
- )
- actual_output_lens.append(output_len)
- total_input += input_requests[i].prompt_len
- tpot = 0
- if output_len > 1:
- latency_minus_ttft = outputs[i].latency - outputs[i].ttft
- tpot = latency_minus_ttft / (output_len - 1)
- tpots.append(tpot)
- # Note: if output_len <= 1, we regard tpot as 0 for goodput
- all_tpots.append(tpot)
- itls += outputs[i].itl
- ttfts.append(outputs[i].ttft)
- e2els.append(outputs[i].latency)
- completed += 1
- else:
- actual_output_lens.append(0)
-
- if goodput_config_dict:
- valid_metrics = []
- slo_values = []
-
- if "ttft" in goodput_config_dict:
- valid_metrics.append(ttfts)
- slo_values.append(
- goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "tpot" in goodput_config_dict:
- valid_metrics.append(all_tpots)
- slo_values.append(
- goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "e2el" in goodput_config_dict:
- valid_metrics.append(e2els)
- slo_values.append(
- goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
-
- for req_metric in zip(*valid_metrics):
- is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
- if is_good_req:
- good_completed += 1
-
- if completed == 0:
- warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2,
- )
- metrics = BenchmarkMetrics(
- completed=completed,
- total_input=total_input,
- total_output=sum(actual_output_lens),
- request_throughput=completed / dur_s,
- request_goodput=good_completed / dur_s,
- output_throughput=sum(actual_output_lens) / dur_s,
- total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
- mean_ttft_ms=np.mean(ttfts or 0)
- * 1000, # ttfts is empty if streaming is not supported by backend
- std_ttft_ms=np.std(ttfts or 0) * 1000,
- median_ttft_ms=np.median(ttfts or 0) * 1000,
- percentiles_ttft_ms=[
- (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_tpot_ms=np.mean(tpots or 0) * 1000,
- std_tpot_ms=np.std(tpots or 0) * 1000,
- median_tpot_ms=np.median(tpots or 0) * 1000,
- percentiles_tpot_ms=[
- (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_itl_ms=np.mean(itls or 0) * 1000,
- std_itl_ms=np.std(itls or 0) * 1000,
- median_itl_ms=np.median(itls or 0) * 1000,
- percentiles_itl_ms=[
- (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_e2el_ms=np.mean(e2els or 0) * 1000,
- std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.median(e2els or 0) * 1000,
- percentiles_e2el_ms=[
- (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
- ],
- )
-
- return metrics, actual_output_lens
-
-
-async def benchmark(
- backend: str,
- api_url: str,
- base_url: str,
- model_id: str,
- model_name: str,
- tokenizer: PreTrainedTokenizerBase,
- input_requests: list[SampleRequest],
- logprobs: Optional[int],
- request_rate: float,
- burstiness: float,
- disable_tqdm: bool,
- profile: bool,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- ignore_eos: bool,
- goodput_config_dict: dict[str, float],
- max_concurrency: Optional[int],
- lora_modules: Optional[Iterable[str]],
- extra_body: Optional[dict],
- ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
- ramp_up_start_rps: Optional[int] = None,
- ramp_up_end_rps: Optional[int] = None,
-):
- if backend in ASYNC_REQUEST_FUNCS:
- request_func = ASYNC_REQUEST_FUNCS[backend]
- else:
- raise ValueError(f"Unknown backend: {backend}")
-
- print("Starting initial single prompt test run...")
- test_prompt, test_prompt_len, test_output_len, test_mm_content = (
- input_requests[0].prompt,
- input_requests[0].prompt_len,
- input_requests[0].expected_output_len,
- input_requests[0].multi_modal_data,
- )
-
- assert (
- test_mm_content is None
- or isinstance(test_mm_content, dict)
- or (
- isinstance(test_mm_content, list)
- and all(isinstance(item, dict) for item in test_mm_content)
- )
- ), "multi_modal_data must be a dict or list[dict]"
- test_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
-
- test_output = await request_func(request_func_input=test_input)
- if not test_output.success:
- raise ValueError(
- "Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}"
- )
- else:
- print("Initial test run completed. Starting main benchmark run...")
-
- if lora_modules:
- # For each input request, choose a LoRA module at random.
- lora_modules = iter(
- [random.choice(lora_modules) for _ in range(len(input_requests))]
- )
-
- if profile:
- print("Starting profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=base_url + "/start_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler started")
-
- distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-
- if ramp_up_strategy is not None:
- print(
- f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
- f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
- "the duration of the benchmark."
- )
- else:
- print(f"Traffic request rate: {request_rate} RPS.")
-
- print(f"Burstiness factor: {burstiness} ({distribution})")
- print(f"Maximum request concurrency: {max_concurrency}")
-
- pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
- # This can be used once the minimum Python version is 3.10 or higher,
- # and it will simplify the code in limited_request_func.
- # semaphore = (asyncio.Semaphore(max_concurrency)
- # if max_concurrency else contextlib.nullcontext())
- semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-
- async def limited_request_func(request_func_input, pbar):
- if semaphore is None:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
- async with semaphore:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
-
- benchmark_start_time = time.perf_counter()
- tasks: list[asyncio.Task] = []
-
- rps_change_events = []
- last_int_rps = -1
- if ramp_up_strategy is not None and ramp_up_start_rps is not None:
- last_int_rps = ramp_up_start_rps
- rps_change_events.append(
- {
- "rps": last_int_rps,
- "timestamp": datetime.now().isoformat(),
- }
- )
-
- async for request, current_request_rate in get_request(
- input_requests,
- request_rate,
- burstiness,
- ramp_up_strategy,
- ramp_up_start_rps,
- ramp_up_end_rps,
- ):
- if ramp_up_strategy is not None:
- current_int_rps = int(current_request_rate)
- if current_int_rps > last_int_rps:
- timestamp = datetime.now().isoformat()
- for rps_val in range(last_int_rps + 1, current_int_rps + 1):
- rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
- last_int_rps = current_int_rps
-
- prompt, prompt_len, output_len, mm_content, request_id = (
- request.prompt,
- request.prompt_len,
- request.expected_output_len,
- request.multi_modal_data,
- request.request_id,
- )
- req_model_id, req_model_name = model_id, model_name
- if lora_modules:
- req_lora_module = next(lora_modules)
- req_model_id, req_model_name = req_lora_module, req_lora_module
-
- request_func_input = RequestFuncInput(
- model=req_model_id,
- model_name=req_model_name,
- prompt=prompt,
- api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
- logprobs=logprobs,
- multi_modal_content=mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- request_id=request_id,
- )
- task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
- tasks.append(asyncio.create_task(task))
- outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
-
- if pbar is not None:
- pbar.close()
-
- benchmark_duration = time.perf_counter() - benchmark_start_time
-
- metrics, actual_output_lens = calculate_metrics(
- input_requests=input_requests,
- outputs=outputs,
- dur_s=benchmark_duration,
- tokenizer=tokenizer,
- selected_percentile_metrics=selected_percentile_metrics,
- selected_percentiles=selected_percentiles,
- goodput_config_dict=goodput_config_dict,
- )
-
- print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
- print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- if max_concurrency is not None:
- print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
- if request_rate != float("inf"):
- print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
- print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
- print(
- "{:<40} {:<10.2f}".format(
- "Request throughput (req/s):", metrics.request_throughput
- )
- )
- if goodput_config_dict:
- print(
- "{:<40} {:<10.2f}".format(
- "Request goodput (req/s):", metrics.request_goodput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Output token throughput (tok/s):", metrics.output_throughput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
- )
- )
-
- result = {
- "duration": benchmark_duration,
- "completed": metrics.completed,
- "total_input_tokens": metrics.total_input,
- "total_output_tokens": metrics.total_output,
- "request_throughput": metrics.request_throughput,
- "request_goodput": metrics.request_goodput if goodput_config_dict else None,
- "output_throughput": metrics.output_throughput,
- "total_token_throughput": metrics.total_token_throughput,
- "input_lens": [output.prompt_len for output in outputs],
- "output_lens": actual_output_lens,
- "ttfts": [output.ttft for output in outputs],
- "itls": [output.itl for output in outputs],
- "generated_texts": [output.generated_text for output in outputs],
- "errors": [output.error for output in outputs],
- }
-
- if rps_change_events:
- result["rps_change_events"] = rps_change_events
-
- def process_one_metric(
- # E.g., "ttft"
- metric_attribute_name: str,
- # E.g., "TTFT"
- metric_name: str,
- # E.g., "Time to First Token"
- metric_header: str,
- ):
- # This function prints and adds statistics of the specified
- # metric.
- if metric_attribute_name not in selected_percentile_metrics:
- return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
- print(
- "{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms"),
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms"),
- )
- )
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms"
- )
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms"
- )
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms"
- )
- for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
- p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
- result[f"p{p_word}_{metric_attribute_name}_ms"] = value
-
- process_one_metric("ttft", "TTFT", "Time to First Token")
- process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
- process_one_metric("itl", "ITL", "Inter-token Latency")
- process_one_metric("e2el", "E2EL", "End-to-end Latency")
-
- print("=" * 50)
-
- if profile:
- print("Stopping profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=base_url + "/stop_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler stopped")
-
- return result
-
-
-def check_goodput_args(args):
- # Check and parse goodput arguments
- goodput_config_dict = {}
- VALID_NAMES = ["ttft", "tpot", "e2el"]
- if args.goodput:
- goodput_config_dict = parse_goodput(args.goodput)
- for slo_name, slo_val in goodput_config_dict.items():
- if slo_name not in VALID_NAMES:
- raise ValueError(
- f"Invalid metric name found, {slo_name}: {slo_val}. "
- "The service level objective name should be one of "
- f"{str(VALID_NAMES)}. "
- )
- if slo_val < 0:
- raise ValueError(
- f"Invalid value found, {slo_name}: {slo_val}. "
- "The service level objective value should be "
- "non-negative."
- )
- return goodput_config_dict
-
-
-def parse_goodput(slo_pairs):
- goodput_config_dict = {}
- try:
- for slo_pair in slo_pairs:
- slo_name, slo_val = slo_pair.split(":")
- goodput_config_dict[slo_name] = float(slo_val)
- except ValueError as err:
- raise argparse.ArgumentTypeError(
- "Invalid format found for service level objectives. "
- 'Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is a "
- "number in milliseconds."
- ) from err
- return goodput_config_dict
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any], file_name: str
-) -> None:
- metrics = [
- "median_ttft_ms",
- "mean_ttft_ms",
- "std_ttft_ms",
- "p99_ttft_ms",
- "mean_tpot_ms",
- "median_tpot_ms",
- "std_tpot_ms",
- "p99_tpot_ms",
- "median_itl_ms",
- "mean_itl_ms",
- "std_itl_ms",
- "p99_itl_ms",
- ]
- # These raw data might be useful, but they are rather big. They can be added
- # later if needed
- ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={k: [results[k]] for k in metrics},
- extra_info={
- k: results[k]
- for k in results
- if k not in metrics and k not in ignored_metrics
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_serving.py is deprecated and will be removed in a future "
- "version. Please use 'vllm bench serve' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
- random.seed(args.seed)
- np.random.seed(args.seed)
-
- backend = args.backend
- model_id = args.model
- model_name = args.served_model_name
- tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
- tokenizer_mode = args.tokenizer_mode
-
- # Validate ramp-up arguments
- if args.ramp_up_strategy is not None:
- if args.request_rate != float("inf"):
- raise ValueError(
- "When using ramp-up, do not specify --request-rate. "
- "The request rate will be controlled by ramp-up parameters. "
- "Please remove the --request-rate argument."
- )
- if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
- raise ValueError(
- "When using --ramp-up-strategy, both --ramp-up-start-rps and "
- "--ramp-up-end-rps must be specified"
- )
- if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
- raise ValueError("Ramp-up start and end RPS must be non-negative")
- if args.ramp_up_start_rps > args.ramp_up_end_rps:
- raise ValueError("Ramp-up start RPS must be less than end RPS")
- if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
- raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
-
- if args.base_url is not None:
- api_url = f"{args.base_url}{args.endpoint}"
- base_url = f"{args.base_url}"
- else:
- api_url = f"http://{args.host}:{args.port}{args.endpoint}"
- base_url = f"http://{args.host}:{args.port}"
-
- tokenizer = get_tokenizer(
- tokenizer_id,
- tokenizer_mode=tokenizer_mode,
- trust_remote_code=args.trust_remote_code,
- )
-
- if args.dataset_name is None:
- raise ValueError(
- "Please specify '--dataset-name' and the corresponding "
- "'--dataset-path' if required."
- )
-
- if args.dataset_name == "custom":
- dataset = CustomDataset(dataset_path=args.dataset_path)
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.custom_output_len,
- skip_chat_template=args.custom_skip_chat_template,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "sonnet":
- dataset = SonnetDataset(dataset_path=args.dataset_path)
- # For the "sonnet" dataset, formatting depends on the backend.
- if args.backend == "openai-chat":
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=False,
- request_id_prefix=args.request_id_prefix,
- )
- else:
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=True,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "hf":
- # all following datasets are implemented from the
- # HuggingFaceDataset base class
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = VisionArenaDataset
- args.hf_split = "train"
- args.hf_subset = None
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = InstructCoderDataset
- args.hf_split = "train"
- elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = MTBenchDataset
- args.hf_split = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ConversationDataset
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_class = AIMODataset
- args.hf_split = "train"
- elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
- dataset_class = NextEditPredictionDataset
- args.hf_split = "train"
- elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ASRDataset
- args.hf_split = "train"
- else:
- supported_datasets = set(
- [
- dataset_name
- for cls in HuggingFaceDataset.__subclasses__()
- for dataset_name in cls.SUPPORTED_DATASET_PATHS
- ]
- )
- raise ValueError(
- f"Unsupported dataset path: {args.dataset_path}. "
- "Huggingface dataset only supports dataset_path"
- f" from one of following: {supported_datasets}. "
- "Please consider contributing if you would "
- "like to add support for additional dataset formats."
- )
-
- if dataset_class.IS_MULTIMODAL and backend not in [
- "openai-chat",
- "openai-audio",
- ]:
- # multi-modal benchmark is only available on OpenAI Chat backend.
- raise ValueError(
- "Multi-modal content is only supported on 'openai-chat' and "
- "'openai-audio' backend."
- )
- input_requests = dataset_class(
- dataset_path=args.dataset_path,
- dataset_subset=args.hf_subset,
- dataset_split=args.hf_split,
- random_seed=args.seed,
- no_stream=args.no_stream,
- ).sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.hf_output_len,
- request_id_prefix=args.request_id_prefix,
- )
-
- else:
- # For datasets that follow a similar structure, use a mapping.
- dataset_mapping = {
- "sharegpt": lambda: ShareGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
- request_id_prefix=args.request_id_prefix,
- ),
- "burstgpt": lambda: BurstGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- request_id_prefix=args.request_id_prefix,
- ),
- "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- prefix_len=args.random_prefix_len,
- input_len=args.random_input_len,
- output_len=args.random_output_len,
- range_ratio=args.random_range_ratio,
- request_id_prefix=args.request_id_prefix,
- ),
- }
-
- try:
- input_requests = dataset_mapping[args.dataset_name]()
- except KeyError as err:
- raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
- goodput_config_dict = check_goodput_args(args)
-
- # Collect the sampling parameters.
- sampling_params = {
- k: v
- for k, v in {
- "top_p": args.top_p,
- "top_k": args.top_k,
- "min_p": args.min_p,
- "temperature": args.temperature,
- }.items()
- if v is not None
- }
-
- # Sampling parameters are only supported by openai-compatible backend.
- if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible backends."
- )
-
- if "temperature" not in sampling_params:
- sampling_params["temperature"] = 0.0 # Default to greedy decoding.
-
- if args.backend == "llama.cpp":
- # Disable prompt caching in llama.cpp backend
- sampling_params["cache_prompt"] = False
-
- # Avoid GC processing "static" data - reduce pause times.
- gc.collect()
- gc.freeze()
-
- benchmark_result = asyncio.run(
- benchmark(
- backend=backend,
- api_url=api_url,
- base_url=base_url,
- model_id=model_id,
- model_name=model_name,
- tokenizer=tokenizer,
- input_requests=input_requests,
- logprobs=args.logprobs,
- request_rate=args.request_rate,
- burstiness=args.burstiness,
- disable_tqdm=args.disable_tqdm,
- profile=args.profile,
- selected_percentile_metrics=args.percentile_metrics.split(","),
- selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
- ignore_eos=args.ignore_eos,
- goodput_config_dict=goodput_config_dict,
- max_concurrency=args.max_concurrency,
- lora_modules=args.lora_modules,
- extra_body=sampling_params,
- ramp_up_strategy=args.ramp_up_strategy,
- ramp_up_start_rps=args.ramp_up_start_rps,
- ramp_up_end_rps=args.ramp_up_end_rps,
- )
- )
-
- # Save config and results to json
- if args.save_result or args.append_result:
- result_json: dict[str, Any] = {}
-
- # Setup
- current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
- result_json["date"] = current_dt
- result_json["backend"] = backend
- result_json["model_id"] = model_id
- result_json["tokenizer_id"] = tokenizer_id
- result_json["num_prompts"] = args.num_prompts
-
- # Metadata
- if args.metadata:
- for item in args.metadata:
- if "=" in item:
- kvstring = item.split("=")
- result_json[kvstring[0].strip()] = kvstring[1].strip()
- else:
- raise ValueError(
- "Invalid metadata format. Please use KEY=VALUE format."
- )
- # Traffic
- result_json["request_rate"] = (
- args.request_rate if args.request_rate < float("inf") else "inf"
- )
- result_json["burstiness"] = args.burstiness
- result_json["max_concurrency"] = args.max_concurrency
-
- if args.ramp_up_strategy is not None:
- result_json["ramp_up_strategy"] = args.ramp_up_strategy
- result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
- result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
-
- # Merge with benchmark result
- result_json = {**result_json, **benchmark_result}
-
- if not args.save_detailed:
- # Remove fields with too many data points
- for field in [
- "input_lens",
- "output_lens",
- "ttfts",
- "itls",
- "generated_texts",
- "errors",
- ]:
- if field in result_json:
- del result_json[field]
- if field in benchmark_result:
- del benchmark_result[field]
-
- # Save to file
- base_model_id = model_id.split("/")[-1]
- max_concurrency_str = (
- f"-concurrency{args.max_concurrency}"
- if args.max_concurrency is not None
- else ""
- )
- if args.ramp_up_strategy is not None:
- file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- else:
- file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- if args.result_filename:
- file_name = args.result_filename
- if args.result_dir:
- os.makedirs(args.result_dir, exist_ok=True)
- file_name = os.path.join(args.result_dir, file_name)
- with open(
- file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
- ) as outfile:
- # Append a newline.
- if args.append_result and outfile.tell() != 0:
- outfile.write("\n")
- json.dump(result_json, outfile)
- save_to_pytorch_benchmark_format(args, result_json, file_name)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput."
- )
- parser.add_argument(
- "--backend",
- type=str,
- default="vllm",
- choices=list(ASYNC_REQUEST_FUNCS.keys()),
- )
- parser.add_argument(
- "--base-url",
- type=str,
- default=None,
- help="Server or API base url if not using http host and port.",
- )
- # Use 127.0.0.1 here instead of localhost to force the use of ipv4
- parser.add_argument("--host", type=str, default="127.0.0.1")
- parser.add_argument("--port", type=int, default=8000)
- parser.add_argument(
- "--endpoint",
- type=str,
- default="/v1/completions",
- help="API endpoint.",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- default="sharegpt",
- choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
- help="Name of the dataset to benchmark on.",
- )
- parser.add_argument(
- "--dataset-path",
- type=str,
- default=None,
- help="Path to the sharegpt/sonnet dataset. "
- "Or the huggingface dataset ID if using HF dataset.",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--max-concurrency",
- type=int,
- default=None,
- help="Maximum number of concurrent requests. This can be used "
- "to help simulate an environment where a higher level component "
- "is enforcing a maximum number of concurrent requests. While the "
- "--request-rate argument controls the rate at which requests are "
- "initiated, this argument will control how many are actually allowed "
- "to execute at a time. This means that when used in combination, the "
- "actual request rate may be lower than specified with --request-rate, "
- "if the server is not processing requests fast enough to keep up.",
- )
-
- parser.add_argument(
- "--model",
- type=str,
- required=True,
- help="Name of the model.",
- )
- parser.add_argument(
- "--tokenizer",
- type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-prompts",
- type=int,
- default=1000,
- help="Number of prompts to process.",
- )
- parser.add_argument(
- "--logprobs",
- type=int,
- default=None,
- help=(
- "Number of logprobs-per-token to compute & return as part of "
- "the request. If unspecified, then either (1) if beam search "
- "is disabled, no logprobs are computed & a single dummy "
- "logprob is returned for each token; or (2) if beam search "
- "is enabled 1 logprob per token is computed"
- ),
- )
- parser.add_argument(
- "--request-rate",
- type=float,
- default=float("inf"),
- help="Number of requests per second. If this is inf, "
- "then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process or gamma distribution "
- "to synthesize the request arrival times.",
- )
- parser.add_argument(
- "--burstiness",
- type=float,
- default=1.0,
- help="Burstiness factor of the request generation. "
- "Only take effect when request_rate is not inf. "
- "Default value is 1, which follows Poisson process. "
- "Otherwise, the request intervals follow a gamma distribution. "
- "A lower burstiness value (0 < burstiness < 1) results in more "
- "bursty requests. A higher burstiness value (burstiness > 1) "
- "results in a more uniform arrival of requests.",
- )
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument(
- "--trust-remote-code",
- action="store_true",
- help="Trust remote code from huggingface",
- )
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
- )
- parser.add_argument(
- "--save-result",
- action="store_true",
- help="Specify to save benchmark results to a json file",
- )
- parser.add_argument(
- "--save-detailed",
- action="store_true",
- help="When saving the results, whether to include per request "
- "information such as response, error, ttfs, tpots, etc.",
- )
- parser.add_argument(
- "--append-result",
- action="store_true",
- help="Append the benchmark result to the existing json file.",
- )
- parser.add_argument(
- "--metadata",
- metavar="KEY=VALUE",
- nargs="*",
- help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
- "for metadata of this run to be saved in the result JSON file "
- "for record keeping purposes.",
- )
- parser.add_argument(
- "--result-dir",
- type=str,
- default=None,
- help="Specify directory to save benchmark json results."
- "If not specified, results are saved in the current directory.",
- )
- parser.add_argument(
- "--result-filename",
- type=str,
- default=None,
- help="Specify the filename to save benchmark json results."
- "If not specified, results will be saved in "
- "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
- " format.",
- )
- parser.add_argument(
- "--ignore-eos",
- action="store_true",
- help="Set ignore_eos flag when sending the benchmark request."
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
- )
- parser.add_argument(
- "--percentile-metrics",
- type=str,
- default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentils. "
- "This argument specifies the metrics to report percentiles. "
- 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
- 'Default value is "ttft,tpot,itl".',
- )
- parser.add_argument(
- "--metric-percentiles",
- type=str,
- default="99",
- help="Comma-separated list of percentiles for selected metrics. "
- 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
- 'Default value is "99". '
- 'Use "--percentile-metrics" to select metrics.',
- )
- parser.add_argument(
- "--goodput",
- nargs="+",
- required=False,
- help='Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is in "
- 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
- "separated by spaces. Allowed request level metric names are "
- '"ttft", "tpot", "e2el". For more context on the definition of '
- "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
- "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
- )
- parser.add_argument(
- "--request-id-prefix",
- type=str,
- required=False,
- default="benchmark-serving",
- help="Specify the prefix of request id.",
- )
-
- # group for dataset specific arguments
- custom_group = parser.add_argument_group("custom dataset options")
- custom_group.add_argument(
- "--custom-output-len",
- type=int,
- default=256,
- help="Number of output tokens per request, used only for custom dataset.",
- )
- custom_group.add_argument(
- "--custom-skip-chat-template",
- action="store_true",
- help="Skip applying chat template to prompt, used only for custom dataset.",
- )
-
- sonnet_group = parser.add_argument_group("sonnet dataset options")
- sonnet_group.add_argument(
- "--sonnet-input-len",
- type=int,
- default=550,
- help="Number of input tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-output-len",
- type=int,
- default=150,
- help="Number of output tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-prefix-len",
- type=int,
- default=200,
- help="Number of prefix tokens per request, used only for sonnet dataset.",
- )
-
- sharegpt_group = parser.add_argument_group("sharegpt dataset options")
- sharegpt_group.add_argument(
- "--sharegpt-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.",
- )
-
- random_group = parser.add_argument_group("random dataset options")
- random_group.add_argument(
- "--random-input-len",
- type=int,
- default=1024,
- help="Number of input tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-output-len",
- type=int,
- default=128,
- help="Number of output tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-range-ratio",
- type=float,
- default=0.0,
- help="Range ratio for sampling input/output length, "
- "used only for random sampling. Must be in the range [0, 1) to define "
- "a symmetric sampling range"
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
- random_group.add_argument(
- "--random-prefix-len",
- type=int,
- default=0,
- help=(
- "Number of fixed prefix tokens before the random context "
- "in a request. "
- "The total input length is the sum of `random-prefix-len` and "
- "a random "
- "context length sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]."
- ),
- )
-
- hf_group = parser.add_argument_group("hf dataset options")
- hf_group.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output lengths "
- "from the sampled HF dataset.",
- )
-
- sampling_group = parser.add_argument_group("sampling parameters")
- sampling_group.add_argument(
- "--top-p",
- type=float,
- default=None,
- help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--top-k",
- type=int,
- default=None,
- help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--min-p",
- type=float,
- default=None,
- help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--temperature",
- type=float,
- default=None,
- help="Temperature sampling parameter. Only has effect on "
- "openai-compatible backends. If not specified, default to greedy "
- "decoding (i.e. temperature==0.0).",
- )
-
- parser.add_argument(
- "--tokenizer-mode",
- type=str,
- default="auto",
- choices=["auto", "slow", "mistral", "custom"],
- help='The tokenizer mode.\n\n* "auto" will use the '
- 'fast tokenizer if available.\n* "slow" will '
- "always use the slow tokenizer. \n* "
- '"mistral" will always use the `mistral_common` tokenizer. \n*'
- '"custom" will use --tokenizer to select the preregistered tokenizer.',
- )
-
- parser.add_argument(
- "--served-model-name",
- type=str,
- default=None,
- help="The model name used in the API. "
- "If not specified, the model name will be the "
- "same as the ``--model`` argument. ",
- )
-
- parser.add_argument(
- "--lora-modules",
- nargs="+",
- default=None,
- help="A subset of LoRA module names passed in when "
- "launching the server. For each request, the "
- "script chooses a LoRA module at random.",
- )
-
- parser.add_argument(
- "--ramp-up-strategy",
- type=str,
- default=None,
- choices=["linear", "exponential"],
- help="The ramp-up strategy. This would be used to "
- "ramp up the request rate from initial RPS to final "
- "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
- "over the duration of the benchmark.",
- )
- parser.add_argument(
- "--ramp-up-start-rps",
- type=int,
- default=None,
- help="The starting request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
- parser.add_argument(
- "--ramp-up-end-rps",
- type=int,
- default=None,
- help="The ending request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench serve
+
+For help with the new command, run:
+ vllm bench serve --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench serve --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index ca6843a72..4aae755eb 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -998,7 +998,7 @@ def create_argument_parser():
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentils. "
+ help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6b24b8c8f..b6dc0918f 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,741 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
-
-import argparse
-import dataclasses
-import json
-import os
-import random
-import time
-import warnings
-from typing import Any, Optional, Union
-
-import torch
-import uvloop
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from benchmark_dataset import (
- AIMODataset,
- BurstGPTDataset,
- ConversationDataset,
- InstructCoderDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
- build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TextPrompt, TokensPrompt
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-
-
-def run_vllm(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests: Optional[list[LoRARequest]] = None
- if engine_args.enable_lora:
- lora_requests = [request.lora_request for request in requests]
-
- use_beam_search = False
-
- outputs = None
- if not use_beam_search:
- start = time.perf_counter()
- outputs = llm.generate(
- prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
- )
- end = time.perf_counter()
- else:
- assert lora_requests is None, "BeamSearch API does not support LoRA"
- # output_len should be the same for all requests.
- output_len = requests[0].expected_output_len
- for request in requests:
- assert request.expected_output_len == output_len
- start = time.perf_counter()
- llm.beam_search(
- prompts,
- BeamSearchParams(
- beam_width=n,
- max_tokens=output_len,
- ignore_eos=True,
- ),
- )
- end = time.perf_counter()
- return end - start, outputs
-
-
-def run_vllm_chat(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, list[RequestOutput]]:
- """
- Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
- multimodal models as it properly handles multimodal inputs and chat
- formatting. For non-multimodal models, use run_vllm() instead.
- """
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
-
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of "
- "prompt_len and expected_output_len for all requests."
- )
-
- prompts = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(request.prompt)
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- start = time.perf_counter()
- outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
- end = time.perf_counter()
- return end - start, outputs
-
-
-async def run_vllm_async(
- requests: list[SampleRequest],
- n: int,
- engine_args: AsyncEngineArgs,
- disable_frontend_multiprocessing: bool = False,
- disable_detokenize: bool = False,
-) -> float:
- from vllm import SamplingParams
-
- async with build_async_engine_client_from_engine_args(
- engine_args,
- disable_frontend_multiprocessing=disable_frontend_multiprocessing,
- ) as llm:
- model_config = await llm.get_model_config()
- assert all(
- model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
-
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- lora_requests: list[Optional[LoRARequest]] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests.append(request.lora_request)
-
- generators = []
- start = time.perf_counter()
- for i, (prompt, sp, lr) in enumerate(
- zip(prompts, sampling_params, lora_requests)
- ):
- generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
- generators.append(generator)
- all_gens = merge_async_iterators(*generators)
- async for i, res in all_gens:
- pass
- end = time.perf_counter()
- return end - start
-
-
-def run_hf(
- requests: list[SampleRequest],
- model: str,
- tokenizer: PreTrainedTokenizerBase,
- n: int,
- max_batch_size: int,
- trust_remote_code: bool,
- disable_detokenize: bool = False,
-) -> float:
- llm = AutoModelForCausalLM.from_pretrained(
- model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
- )
- if llm.config.model_type == "llama":
- # To enable padding in the HF backend.
- tokenizer.pad_token = tokenizer.eos_token
- llm = llm.cuda()
-
- pbar = tqdm(total=len(requests))
- start = time.perf_counter()
- batch: list[str] = []
- max_prompt_len = 0
- max_output_len = 0
- for i in range(len(requests)):
- prompt = requests[i].prompt
- prompt_len = requests[i].prompt_len
- output_len = requests[i].expected_output_len
- # Add the prompt to the batch.
- batch.append(prompt)
- max_prompt_len = max(max_prompt_len, prompt_len)
- max_output_len = max(max_output_len, output_len)
- if len(batch) < max_batch_size and i != len(requests) - 1:
- # Check if we can add more requests to the batch.
- next_prompt_len = requests[i + 1].prompt_len
- next_output_len = requests[i + 1].expected_output_len
- if (
- max(max_prompt_len, next_prompt_len)
- + max(max_output_len, next_output_len)
- ) <= 2048:
- # We can add more requests to the batch.
- continue
-
- # Generate the sequences.
- input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
- llm_outputs = llm.generate(
- input_ids=input_ids.cuda(),
- do_sample=True,
- num_return_sequences=n,
- temperature=1.0,
- top_p=1.0,
- use_cache=True,
- max_new_tokens=max_output_len,
- )
- if not disable_detokenize:
- # Include the decoding time.
- tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
- pbar.update(len(batch))
-
- # Clear the batch.
- batch = []
- max_prompt_len = 0
- max_output_len = 0
- end = time.perf_counter()
- return end - start
-
-
-def run_mii(
- requests: list[SampleRequest],
- model: str,
- tensor_parallel_size: int,
- output_len: int,
-) -> float:
- from mii import client, serve
-
- llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [request.prompt for request in requests]
-
- start = time.perf_counter()
- llm.generate(prompts, max_new_tokens=output_len)
- end = time.perf_counter()
- client = client(model)
- client.terminate_server()
- return end - start
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={
- "requests_per_second": [results["requests_per_second"]],
- "tokens_per_second": [results["tokens_per_second"]],
- },
- extra_info={
- k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-def get_requests(args, tokenizer):
- # Common parameters for all dataset types.
- common_kwargs = {
- "dataset_path": args.dataset_path,
- "random_seed": args.seed,
- }
- sample_kwargs = {
- "tokenizer": tokenizer,
- "lora_path": args.lora_path,
- "max_loras": args.max_loras,
- "num_requests": args.num_prompts,
- "input_len": args.input_len,
- "output_len": args.output_len,
- }
-
- if args.dataset_path is None or args.dataset_name == "random":
- sample_kwargs["range_ratio"] = args.random_range_ratio
- sample_kwargs["prefix_len"] = args.prefix_len
- dataset_cls = RandomDataset
- elif args.dataset_name == "sharegpt":
- dataset_cls = ShareGPTDataset
- if args.backend == "vllm-chat":
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_name == "sonnet":
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- dataset_cls = SonnetDataset
- sample_kwargs["prefix_len"] = args.prefix_len
- sample_kwargs["return_prompt_formatted"] = True
- elif args.dataset_name == "burstgpt":
- dataset_cls = BurstGPTDataset
- elif args.dataset_name == "hf":
- common_kwargs["no_stream"] = args.no_stream
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = VisionArenaDataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = InstructCoderDataset
- common_kwargs["dataset_split"] = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = ConversationDataset
- common_kwargs["dataset_subset"] = args.hf_subset
- common_kwargs["dataset_split"] = args.hf_split
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = AIMODataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- else:
- raise ValueError(f"Unknown dataset name: {args.dataset_name}")
- # Remove None values
- sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
- return dataset_cls(**common_kwargs).sample(**sample_kwargs)
-
-
-@deprecated(
- "benchmark_throughput.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench throughput' instead.",
-)
-def main(args: argparse.Namespace):
- if args.seed is None:
- args.seed = 0
- print(args)
- random.seed(args.seed)
- # Sample the requests.
- tokenizer = AutoTokenizer.from_pretrained(
- args.tokenizer, trust_remote_code=args.trust_remote_code
- )
- requests = get_requests(args, tokenizer)
- is_multi_modal = any(request.multi_modal_data is not None for request in requests)
- request_outputs: Optional[list[RequestOutput]] = None
- if args.backend == "vllm":
- if args.async_engine:
- elapsed_time = uvloop.run(
- run_vllm_async(
- requests,
- args.n,
- AsyncEngineArgs.from_cli_args(args),
- args.disable_frontend_multiprocessing,
- args.disable_detokenize,
- )
- )
- else:
- elapsed_time, request_outputs = run_vllm(
- requests,
- args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize,
- )
- elif args.backend == "hf":
- assert args.tensor_parallel_size == 1
- elapsed_time = run_hf(
- requests,
- args.model,
- tokenizer,
- args.n,
- args.hf_max_batch_size,
- args.trust_remote_code,
- args.disable_detokenize,
- )
- elif args.backend == "mii":
- elapsed_time = run_mii(
- requests, args.model, args.tensor_parallel_size, args.output_len
- )
- elif args.backend == "vllm-chat":
- elapsed_time, request_outputs = run_vllm_chat(
- requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
- )
- else:
- raise ValueError(f"Unknown backend: {args.backend}")
-
- if request_outputs:
- # Note: with the vllm and vllm-chat backends,
- # we have request_outputs, which we use to count tokens.
- total_prompt_tokens = 0
- total_output_tokens = 0
- for ro in request_outputs:
- if not isinstance(ro, RequestOutput):
- continue
- total_prompt_tokens += (
- len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
- )
- total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
- total_num_tokens = total_prompt_tokens + total_output_tokens
- else:
- total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
- total_output_tokens = sum(r.expected_output_len for r in requests)
- total_prompt_tokens = total_num_tokens - total_output_tokens
-
- if is_multi_modal and args.backend != "vllm-chat":
- print(
- "\033[91mWARNING\033[0m: Multi-modal request with "
- f"{args.backend} backend detected. The "
- "following metrics are not accurate because image tokens are not"
- " counted. See vllm-project/vllm/issues/9778 for details."
- )
- # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
- # vllm-chat backend counts the image tokens now
-
- print(
- f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
- f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
- f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
- )
- print(f"Total num prompt tokens: {total_prompt_tokens}")
- print(f"Total num output tokens: {total_output_tokens}")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "elapsed_time": elapsed_time,
- "num_requests": len(requests),
- "total_num_tokens": total_num_tokens,
- "requests_per_second": len(requests) / elapsed_time,
- "tokens_per_second": total_num_tokens / elapsed_time,
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def validate_args(args):
- """
- Validate command-line arguments.
- """
-
- # === Deprecation and Defaulting ===
- if args.dataset is not None:
- warnings.warn(
- "The '--dataset' argument will be deprecated in the next release. "
- "Please use '--dataset-name' and '--dataset-path' instead.",
- stacklevel=2,
- )
- args.dataset_path = args.dataset
-
- if not getattr(args, "tokenizer", None):
- args.tokenizer = args.model
-
- # === Backend Validation ===
- valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
- if args.backend not in valid_backends:
- raise ValueError(f"Unsupported backend: {args.backend}")
-
- # === Dataset Configuration ===
- if not args.dataset and not args.dataset_path:
- print("When dataset path is not set, it will default to random dataset")
- args.dataset_name = "random"
- if args.input_len is None:
- raise ValueError("input_len must be provided for a random dataset")
-
- # === Dataset Name Specific Checks ===
- # --hf-subset and --hf-split: only used
- # when dataset_name is 'hf'
- if args.dataset_name != "hf" and (
- getattr(args, "hf_subset", None) is not None
- or getattr(args, "hf_split", None) is not None
- ):
- warnings.warn(
- "--hf-subset and --hf-split will be ignored \
- since --dataset-name is not 'hf'.",
- stacklevel=2,
- )
- elif args.dataset_name == "hf":
- if args.dataset_path in (
- VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
- | ConversationDataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm-chat", (
- f"{args.dataset_path} needs to use vllm-chat as the backend."
- ) # noqa: E501
- elif args.dataset_path in (
- InstructCoderDataset.SUPPORTED_DATASET_PATHS
- | AIMODataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm", (
- f"{args.dataset_path} needs to use vllm as the backend."
- ) # noqa: E501
- else:
- raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
-
- # --random-range-ratio: only used when dataset_name is 'random'
- if args.dataset_name != "random" and args.random_range_ratio is not None:
- warnings.warn(
- "--random-range-ratio will be ignored since \
- --dataset-name is not 'random'.",
- stacklevel=2,
- )
-
- # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
- # set.
- if (
- args.dataset_name not in {"random", "sonnet", None}
- and args.prefix_len is not None
- ):
- warnings.warn(
- "--prefix-len will be ignored since --dataset-name\
- is not 'random', 'sonnet', or not set.",
- stacklevel=2,
- )
-
- # === LoRA Settings ===
- if getattr(args, "enable_lora", False) and args.backend != "vllm":
- raise ValueError("LoRA benchmarking is only supported for vLLM backend")
- if getattr(args, "enable_lora", False) and args.lora_path is None:
- raise ValueError("LoRA path must be provided when enable_lora is True")
-
- # === Backend-specific Validations ===
- if args.backend == "hf" and args.hf_max_batch_size is None:
- raise ValueError("HF max batch size is required for HF backend")
- if args.backend != "hf" and args.hf_max_batch_size is not None:
- raise ValueError("HF max batch size is only for HF backend.")
-
- if (
- args.backend in {"hf", "mii"}
- and getattr(args, "quantization", None) is not None
- ):
- raise ValueError("Quantization is only for vLLM backend.")
-
- if args.backend == "mii" and args.dtype != "auto":
- raise ValueError("dtype must be auto for MII backend.")
- if args.backend == "mii" and args.n != 1:
- raise ValueError("n must be 1 for MII backend.")
- if args.backend == "mii" and args.tokenizer != args.model:
- raise ValueError("Tokenizer must be the same as the model for MII backend.")
-
- # --data-parallel is not supported currently.
- # https://github.com/vllm-project/vllm/issues/16222
- if args.data_parallel_size > 1:
- raise ValueError(
- "Data parallel is not supported in offline benchmark, "
- "please use benchmark serving instead"
- )
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(description="Benchmark the throughput.")
- parser.add_argument(
- "--backend",
- type=str,
- choices=["vllm", "hf", "mii", "vllm-chat"],
- default="vllm",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
- help="Name of the dataset to benchmark on.",
- default="sharegpt",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--dataset",
- type=str,
- default=None,
- help="Path to the ShareGPT dataset, will be deprecated in\
- the next release. The dataset is expected to "
- "be a json in form of list[dict[..., conversations: "
- "list[dict[..., value: ]]]]",
- )
- parser.add_argument(
- "--dataset-path", type=str, default=None, help="Path to the dataset"
- )
- parser.add_argument(
- "--input-len",
- type=int,
- default=None,
- help="Input prompt length for each request",
- )
- parser.add_argument(
- "--output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the "
- "output length from the dataset.",
- )
- parser.add_argument(
- "--n", type=int, default=1, help="Number of generated sequences per prompt."
- )
- parser.add_argument(
- "--num-prompts", type=int, default=1000, help="Number of prompts to process."
- )
- parser.add_argument(
- "--hf-max-batch-size",
- type=int,
- default=None,
- help="Maximum batch size for HF backend.",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the throughput results in JSON format.",
- )
- parser.add_argument(
- "--async-engine",
- action="store_true",
- default=False,
- help="Use vLLM async engine rather than LLM class.",
- )
- parser.add_argument(
- "--disable-frontend-multiprocessing",
- action="store_true",
- default=False,
- help="Disable decoupled async engine frontend.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize the response (i.e. do not include "
- "detokenization time in the measurement)"
- ),
- )
- # LoRA
- parser.add_argument(
- "--lora-path",
- type=str,
- default=None,
- help="Path to the LoRA adapters to use. This can be an absolute path, "
- "a relative path, or a Hugging Face model identifier.",
- )
- parser.add_argument(
- "--prefix-len",
- type=int,
- default=None,
- help=f"Number of prefix tokens to be used in RandomDataset "
- "and SonnetDataset. For RandomDataset, the total input "
- "length is the sum of prefix-len (default: "
- f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
- "sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]. For SonnetDataset, "
- f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
- "controls how much of the input is fixed lines versus "
- "random lines, but the total input length remains approximately "
- "input_len tokens.",
- )
- # random dataset
- parser.add_argument(
- "--random-range-ratio",
- type=float,
- default=None,
- help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
- "for sampling input/output length, "
- "used only for RandomDataset. Must be in the range [0, 1) to "
- "define a symmetric sampling range "
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
-
- # hf dtaset
- parser.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- parser.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
-
- parser = AsyncEngineArgs.add_cli_args(parser)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.tokenizer is None:
- args.tokenizer = args.model
- validate_args(args)
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench throughput
+
+For help with the new command, run:
+ vllm bench throughput --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 92f97ffab..2c72941cf 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -62,7 +62,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index af2bcba3e..0bbf7cd2b 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -69,7 +69,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
new file mode 100644
index 000000000..f1e504499
--- /dev/null
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ CUTLASS_BLOCK_FP8_SUPPORTED,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton as vllm_triton
+
+assert current_platform.is_cuda(), (
+ "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
+)
+
+# DeepSeek-V3 weight shapes
+DEEPSEEK_V3_SHAPES = [
+ (512 + 64, 7168),
+ (2112, 7168),
+ ((128 + 64) * 128, 7168),
+ (128 * (128 + 128), 512),
+ (7168, 16384),
+ (7168, 18432),
+ (18432 * 2, 7168),
+ (24576, 1536),
+ (12288, 7168),
+ (4096, 7168),
+ (7168, 2048),
+]
+
+
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
+ """Build runner function for w8a8 block fp8 matmul."""
+ factor_for_scale = 1e-2
+
+ fp8_info = torch.finfo(torch.float8_e4m3fn)
+ fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+ # Create random FP8 tensors
+ A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+
+ B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+ # Create scales
+ block_n, block_k = block_size[0], block_size[1]
+ n_tiles = (N + block_n - 1) // block_n
+ k_tiles = (K + block_k - 1) // block_k
+
+ Bs = (
+ torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
+ * factor_for_scale
+ )
+
+ # SM90 CUTLASS requires row-major format for scales
+ if use_cutlass and current_platform.is_device_capability(90):
+ Bs = Bs.T.contiguous()
+
+ def run():
+ if use_cutlass:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+ )
+ else:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+ )
+
+ return run
+
+
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+ available_providers.append("w8a8-block-fp8-cutlass")
+
+
+@vllm_triton.testing.perf_report(
+ vllm_triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+ x_log=False,
+ line_arg="provider",
+ line_vals=available_providers,
+ line_names=available_providers,
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs W8A8 Block FP8 GEMMs",
+ args={},
+ )
+)
+def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
+ M = batch_size
+ device = "cuda"
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
+ b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-triton":
+ run_w8a8_triton = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=False
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_triton(), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-cutlass":
+ run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=True
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_cutlass(), quantiles=quantiles
+ )
+ else:
+ raise ValueError(f"Unknown provider: {provider}")
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+if __name__ == "__main__":
+ block_size = (128, 128)
+
+ for N, K in DEEPSEEK_V3_SHAPES:
+ print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
+
+ print(f"TFLOP/s comparison (block_size={block_size}):")
+ benchmark_tflops.run(
+ print_data=True,
+ # show_plots=False,
+ # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
+ N=N,
+ K=K,
+ block_size=block_size,
+ )
+
+ print("\nBenchmark finished!")
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
new file mode 100644
index 000000000..93edbcc93
--- /dev/null
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# benchmark custom activation op performance
+import itertools
+
+import torch
+
+import vllm.model_executor.layers.activation # noqa F401
+from vllm.model_executor.custom_op import CustomOp
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+intermediate_size = [3072, 9728, 12288]
+configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
+
+
+def benchmark_activation(
+ batch_size: int,
+ seq_len: int,
+ intermediate_size: int,
+ provider: str,
+ func_name: str,
+ dtype: torch.dtype,
+):
+ device = "cuda"
+ num_tokens = batch_size * seq_len
+ dim = intermediate_size
+ current_platform.seed_everything(42)
+ torch.set_default_device(device)
+
+ if func_name == "gelu_and_mul":
+ layer = CustomOp.op_registry[func_name](approximate="none")
+ elif func_name == "gelu_and_mul_tanh":
+ layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+ elif func_name == "fatrelu_and_mul":
+ threshold = 0.5
+ layer = CustomOp.op_registry[func_name](threshold)
+ else:
+ layer = CustomOp.op_registry[func_name]()
+
+ x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
+ compiled_layer = torch.compile(layer.forward_native)
+
+ if provider == "custom":
+ fn = lambda: layer(x)
+ elif provider == "compiled":
+ fn = lambda: compiled_layer(x)
+
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ fn, quantiles=[0.5, 0.2, 0.8]
+ )
+ return ms, max_ms, min_ms
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
+ parser.add_argument(
+ "--func-name",
+ type=str,
+ choices=[
+ "mul_and_silu",
+ "silu_and_mul",
+ "gelu_and_mul",
+ "gelu_and_mul_tanh",
+ "fatrelu_and_mul",
+ "swigluoai_and_mul",
+ "gelu_new",
+ "gelu_fast",
+ "quick_gelu",
+ ],
+ default="silu_and_mul",
+ )
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ args = parser.parse_args()
+ assert args
+
+ func_name = args.func_name
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ perf_report = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size", "seq_len", "intermediate_size"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["custom", "compiled"],
+ line_names=["Custom OP", "Compiled"],
+ styles=[("blue", "-"), ("green", "-")],
+ ylabel="ms",
+ plot_name=f"{func_name}-op-performance",
+ args={},
+ )
+ )
+
+ perf_report(
+ lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
+ batch_size, seq_len, intermediate_size, provider, func_name, dtype
+ )
+ ).run(print_data=True)
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 000000000..a61c17edc
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+Usage:
+ torchrun --nproc_per_node=<num_gpus> benchmark_device_communicators.py [options]
+
+Example:
+ torchrun --nproc_per_node=2 benchmark_device_communicators.py
+ --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+ """Benchmark class for testing device communicators."""
+
+ def __init__(
+ self,
+ rank: int,
+ world_size: int,
+ device: torch.device,
+ cpu_group: ProcessGroup,
+ sequence_lengths: list[int],
+ ):
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.cpu_group = cpu_group
+
+ # Calculate max_size_override based on largest sequence length
+ max_seq_len = max(sequence_lengths)
+ max_tensor_elements = max_seq_len * HIDDEN_SIZE
+ self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+ # Initialize communicators
+ self.custom_allreduce = None
+ self.pynccl_comm = None
+ self.symm_mem_comm = None
+ self.symm_mem_comm_multimem = None
+ self.symm_mem_comm_two_shot = None
+
+ self._init_communicators()
+
+ def _init_communicators(self):
+ """Initialize all available communicators."""
+ try:
+ self.custom_allreduce = CustomAllreduce(
+ group=self.cpu_group,
+ device=self.device,
+ max_size=self.max_size_override,
+ )
+ if not self.custom_allreduce.disabled:
+ logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+ else:
+ logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+ )
+ self.custom_allreduce = None
+
+ try:
+ self.pynccl_comm = PyNcclCommunicator(
+ group=self.cpu_group, device=self.device
+ )
+ if not self.pynccl_comm.disabled:
+ logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+ else:
+ logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+ self.pynccl_comm = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+ )
+ self.pynccl_comm = None
+
+ # Initialize variants for SymmMemCommunicator
+ try:
+ self.symm_mem_comm_multimem = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=True,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_multimem.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_multimem = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_multimem = None
+
+ try:
+ self.symm_mem_comm_two_shot = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=False,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_two_shot.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_two_shot = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_two_shot = None
+
+ def benchmark_allreduce(
+ self, sequence_length: int, num_warmup: int, num_trials: int
+ ) -> dict[str, float]:
+ """Benchmark allreduce operations for all available communicators."""
+
+ results = {}
+
+ # Define communicators with their benchmark functions
+ communicators = []
+
+ if self.custom_allreduce is not None:
+ comm = self.custom_allreduce
+ # CustomAllreduce one-shot
+ communicators.append(
+ (
+ "ca_1stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "1stage", # env variable value
+ )
+ )
+ # CustomAllreduce two-shot
+ communicators.append(
+ (
+ "ca_2stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "2stage", # env variable value
+ )
+ )
+
+ if self.pynccl_comm is not None:
+ comm = self.pynccl_comm
+ communicators.append(
+ (
+ "pynccl",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_multimem is not None:
+ comm = self.symm_mem_comm_multimem
+ communicators.append(
+ (
+ "symm_mem_multimem",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_two_shot is not None:
+ comm = self.symm_mem_comm_two_shot
+ communicators.append(
+ (
+ "symm_mem_two_shot",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ # Benchmark each communicator
+ for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+ # Set environment variable if needed
+ if env_var is not None:
+ os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+ else:
+ # Clear the environment variable to avoid interference
+ os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+ latency = self.benchmark_allreduce_single(
+ sequence_length,
+ allreduce_fn,
+ should_use_fn,
+ context,
+ num_warmup,
+ num_trials,
+ )
+ if latency is not None:
+ results[name] = latency
+
+ return results
+
+ def benchmark_allreduce_single(
+ self,
+ sequence_length: int,
+ allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+ should_use_fn: Callable[[torch.Tensor], bool],
+ context,
+ num_warmup: int,
+ num_trials: int,
+ ) -> Optional[float]:
+ """Benchmark method with CUDA graph optimization."""
+ try:
+ # Create test tensor (2D: sequence_length x hidden_size)
+ tensor = torch.randn(
+ sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+ )
+ if not should_use_fn(tensor):
+ return None
+
+ torch.cuda.synchronize()
+ stream = torch.cuda.Stream()
+ with torch.cuda.stream(stream):
+ graph_input = tensor.clone()
+
+ # Warmup before capture
+ for _ in range(3):
+ allreduce_fn(graph_input)
+
+ # Capture the graph using context manager
+ with context:
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph):
+ for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+ allreduce_fn(graph_input)
+
+ torch.cuda.synchronize()
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+
+ for _ in range(num_trials):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ end_time = time.perf_counter()
+
+ # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+ return (
+ (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+ )
+
+ except Exception as e:
+ logger.error("CUDA graph benchmark failed: %s", e)
+ raise RuntimeError(
+ f"CUDA graph benchmark failed for communicator: {e}"
+ ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+ """Calculate speedup information for a single tensor size."""
+ if not comm_results:
+ return "N/A"
+
+ # Find the fastest communicator
+ fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+ fastest_time = comm_results[fastest_comm]
+
+ # Calculate speedup vs PyNccl if available
+ if "pynccl" in comm_results:
+ pynccl_time = comm_results["pynccl"]
+ speedup = pynccl_time / fastest_time
+ return f"{fastest_comm} ({speedup:.2f}x)"
+ else:
+ return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+ results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+ """Print benchmark results in a formatted table."""
+
+ print(f"\n{'=' * 130}")
+ print("Device Communicator Benchmark Results")
+ print(
+ f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+ f"Hidden Size: {HIDDEN_SIZE}"
+ )
+ print(f"{'=' * 130}")
+
+ # Get all communicator names
+ all_comms = set()
+ for size_results in results.values():
+ all_comms.update(size_results.keys())
+
+ all_comms = sorted(list(all_comms))
+
+ # Print header
+ header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+ for comm in all_comms:
+ header += f"{comm:<20}"
+ header += f"{'Best (Speedup vs PyNccl)':<30}"
+ print(header)
+ print("-" * len(header))
+
+ # Print results for each sequence length
+ for seq_len in sequence_lengths:
+ if seq_len in results:
+ # Calculate tensor size in elements and bytes
+ tensor_elements = seq_len * HIDDEN_SIZE
+ tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+ # Format tensor size (MB)
+ tensor_size_mb = tensor_bytes / (1024 * 1024)
+ tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+ # Format tensor shape
+ tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+ row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+ for comm in all_comms:
+ if comm in results[seq_len]:
+ row += f"{results[seq_len][comm]:<20.3f}"
+ else:
+ row += f"{'N/A':<20}"
+
+ # Calculate speedup information
+ speedup_info = _calculate_speedup_info(results[seq_len])
+ row += f"{speedup_info:<30}"
+
+ print(row)
+
+ print(f"{'=' * 130}")
+ print("All times are in milliseconds (ms) per allreduce operation")
+ print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+ parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+ parser.add_argument(
+ "--sequence-lengths",
+ type=int,
+ nargs="+",
+ default=DEFAULT_SEQUENCE_LENGTHS,
+ help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+ )
+
+ parser.add_argument(
+ "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+ )
+
+ parser.add_argument(
+ "--num-trials", type=int, default=50, help="Number of benchmark trials"
+ )
+
+ parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+ args = parser.parse_args()
+
+ # Initialize distributed
+ if not dist.is_initialized():
+ dist.init_process_group(backend="gloo")
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+
+ # Set device
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+
+ # Get CPU process group
+ cpu_group = dist.new_group(backend="gloo")
+
+ # Disable USE_SYMM_MEM to avoid affecting the max_sizes
+ # in symm_mem and custom_all_reduce for benchmark
+ os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+ # Initialize benchmark
+ benchmark = CommunicatorBenchmark(
+ rank, world_size, device, cpu_group, args.sequence_lengths
+ )
+
+ # Run benchmarks
+ all_results = {}
+
+ for seq_len in args.sequence_lengths:
+ if rank == 0:
+ logger.info(
+ "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+ seq_len,
+ seq_len,
+ HIDDEN_SIZE,
+ )
+
+ results = benchmark.benchmark_allreduce(
+ sequence_length=seq_len,
+ num_warmup=args.num_warmup,
+ num_trials=args.num_trials,
+ )
+
+ all_results[seq_len] = results
+
+ # Synchronize between ranks
+ dist.barrier()
+
+ # Print results (only rank 0)
+ if rank == 0:
+ print_results(all_results, args.sequence_lengths, world_size)
+
+ # Save to JSON if requested
+ if args.output_json:
+ # Add speedup information to results
+ enhanced_results = {}
+ for seq_len, comm_results in all_results.items():
+ enhanced_results[seq_len] = {
+ "timings": comm_results,
+ "speedup_info": _calculate_speedup_info(comm_results),
+ }
+
+ output_data = {
+ "world_size": world_size,
+ "dtype": str(BENCHMARK_DTYPE),
+ "hidden_size": HIDDEN_SIZE,
+ "sequence_lengths": args.sequence_lengths,
+ "num_warmup": args.num_warmup,
+ "num_trials": args.num_trials,
+ "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+ "results": enhanced_results,
+ }
+
+ with open(args.output_json, "w") as f:
+ json.dump(output_data, f, indent=2)
+
+ logger.info("Results saved to %s", args.output_json)
+
+ # Cleanup
+ if cpu_group != dist.group.WORLD:
+ dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 3d38d4b35..89309c79f 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -637,7 +637,7 @@ def bench_optype(
# Clear LoRA optimization hash-maps.
_LORA_A_PTR_DICT.clear()
_LORA_B_PTR_DICT.clear()
- # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+ # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.cuda.synchronize()
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 752c2d008..94f3f1ae1 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -419,8 +419,10 @@ class BenchmarkWorker:
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
+ block_n = block_quant_shape[0] if block_quant_shape else None
+ block_k = block_quant_shape[1] if block_quant_shape else None
op_config = get_moe_configs(
- num_experts, shard_intermediate_size // 2, dtype_str
+ num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
)
if op_config is None:
config = get_default_config(
@@ -430,6 +432,7 @@ class BenchmarkWorker:
hidden_size,
topk,
dtype_str,
+ block_quant_shape,
)
else:
config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
@@ -591,7 +594,11 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
- elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+ elif config.architectures[0] in (
+ "Qwen2MoeForCausalLM",
+ "Qwen3MoeForCausalLM",
+ "Qwen3NextForCausalLM",
+ ):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
@@ -675,7 +682,11 @@ def main(args: argparse.Namespace):
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
-
+ if use_deep_gemm:
+ raise ValueError(
+ "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+ "kernels. Please remove the flag."
+ )
start = time.time()
configs = _distribute(
"tune",
diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py
new file mode 100644
index 000000000..9ac8f5e65
--- /dev/null
+++ b/benchmarks/kernels/benchmark_polynorm.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ def norm(x, eps: float):
+ return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+ x = x.float()
+ return (
+ (
+ weight[0] * norm(x**3, eps)
+ + weight[1] * norm(x**2, eps)
+ + weight[2] * norm(x, eps)
+ + bias
+ )
+ .to(weight.dtype)
+ .view(orig_shape)
+ )
+
+
+def polynorm_vllm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ out = torch.empty_like(x)
+ vllm_ops.poly_norm(out, x, weight, bias, eps)
+ output = out
+
+ output = output.view(orig_shape)
+ return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+ dtype = torch.bfloat16
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ output_naive = polynorm_naive(x, weight, bias)
+ output_vllm = polynorm_vllm(x, weight, bias)
+
+ if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+ print("✅ All implementations match")
+ else:
+ print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["dim", "batch_size", "seq_len"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["naive", "vllm"],
+ line_names=["Naive", "vLLM"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name="polynorm-perf",
+ args={},
+ )
+ )
+ def benchmark(dim, batch_size, seq_len, provider):
+ dtype = torch.bfloat16
+ hidden_dim = dim * 4
+
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "naive":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_naive(x, weight, bias),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_vllm(x, weight, bias),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size",
+ )
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length",
+ )
+ parser.add_argument(
+ "--hidden-dim",
+ type=int,
+ default=8192,
+ help="Intermediate size of MLP",
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/polnorm/",
+ help="Path to save polnorm benchmark results",
+ )
+
+ args = parser.parse_args()
+
+ # Run correctness test
+ calculate_diff(
+ batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ hidden_dim=args.hidden_dim,
+ )
+
+ benchmark = get_benchmark()
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 603ce5ecf..6ddab4621 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -259,6 +259,7 @@ if __name__ == "__main__":
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
(None, FP8_DTYPE, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 40903c6c3..131df74c7 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -274,6 +274,7 @@ if __name__ == "__main__":
quant_dtypes = [
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index e648a9107..df2b713e4 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
- output_dytpe: The dtype of the returned tensor.
+ output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
@@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
# cannot TP
total = [
(512 + 64, 7168),
+ (2112, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index d23b7b6e4..66d85eaf5 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -962,7 +962,7 @@ async def main_mp(
# At this point all the clients finished,
# collect results (TTFT, TPOT, etc.) from all the clients.
- # This needs to happens before calling join on the clients
+ # This needs to happen before calling join on the clients
# (result_queue should be emptied).
while not result_queue.empty():
client_metrics.append(result_queue.get())
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 52bfd82c7..064944632 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(STATUS "Apple Silicon Detected")
+ set(APPLE_SILICON_FOUND TRUE)
set(ENABLE_NUMA OFF)
check_sysctl(hw.optional.neon ASIMD_FOUND)
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@@ -189,7 +190,7 @@ else()
set(USE_ACL OFF)
endif()
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 49defccbb..3d32121f1 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+ GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index 6dd6f269f..c60f1823b 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -36,6 +36,7 @@ limitations under the License.
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
@@ -64,11 +65,11 @@ struct IsPersistent {
static const bool value = v;
};
-template >
+template >
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
- using ElementOut = T;
+ using ElementOut = TOut;
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
@@ -99,6 +100,7 @@ struct MlaSm100 {
template
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -162,7 +164,10 @@ typename T::Fmha::Arguments args_from_options(
stride_PT,
page_count_total,
page_size},
- {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE},
+ {static_cast(out.data_ptr()),
+ stride_O,
+ static_cast(lse.defined() ? lse.data_ptr() : nullptr),
+ stride_LSE},
hw_info,
// TODO(trevor-m): Change split_kv back to -1 when
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
@@ -178,9 +183,10 @@ typename T::Fmha::Arguments args_from_options(
return arguments;
}
-template
+template
void runMla(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -190,9 +196,9 @@ void runMla(
double sm_scale,
int64_t num_kv_splits,
cudaStream_t stream) {
- using MlaSm100Type = MlaSm100;
+ using MlaSm100Type = MlaSm100;
typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+ auto arguments = args_from_options(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
CUTLASS_CHECK(fmha.can_implement(arguments));
@@ -214,6 +220,7 @@ void runMla(
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
@@ -233,14 +240,14 @@ void sm100_cutlass_mla_decode(
DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
if (in_dtype == at::ScalarType::Half) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
@@ -253,7 +260,7 @@ void sm100_cutlass_mla_decode(
int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
// Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
// which are float, so Element type here doesn't matter.
- using MlaSm100Type = MlaSm100;
+ using MlaSm100Type = MlaSm100;
// Get split kv. Requires problem shape and sm_count only.
typename MlaSm100Type::Fmha::Arguments arguments;
diff --git a/csrc/cache.h b/csrc/cache.h
index fb0c353b9..fd230bec2 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -47,4 +47,12 @@ void gather_and_maybe_dequant_cache(
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
- std::optional seq_starts = std::nullopt);
\ No newline at end of file
+ std::optional seq_starts = std::nullopt);
+
+// TODO(hc): cp_gather_cache needs to support scaled kv cache in the future.
+void cp_gather_cache(
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ int64_t batch_size, std::optional seq_starts = std::nullopt);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 21381c30d..03db59ec9 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -1,6 +1,7 @@
#include
#include
#include
+#include
#include "cuda_utils.h"
#include "cuda_compat.h"
@@ -779,3 +780,145 @@ void gather_and_maybe_dequant_cache(
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
}
+
+namespace vllm {
+template
+// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
+// block_size.
+__global__ void cp_gather_cache(
+ const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
+ // ENTRY_SIZE]
+ scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRY_SIZE]
+ const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
+ const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
+ const int32_t block_size, const int32_t entry_size,
+ const int64_t block_table_stride, const int64_t cache_block_stride,
+ const int64_t cache_entry_stride, const int64_t dst_entry_stride,
+ const int32_t* __restrict__ seq_starts // Optional: starting offsets per
+ // batch
+) {
+ const int64_t bid = blockIdx.x; // Batch ID
+ const int32_t num_splits = gridDim.y;
+ const int32_t split = blockIdx.y;
+ const int32_t seq_start = cu_seq_lens[bid];
+ const int32_t seq_end = cu_seq_lens[bid + 1];
+ const int32_t seq_len = seq_end - seq_start;
+ const int32_t tot_slots = seq_len;
+ const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
+
+ const int32_t split_start = split * split_slots;
+ const int32_t split_end = min((split + 1) * split_slots, tot_slots);
+
+ const bool is_active_split = (split_start < tot_slots);
+
+ if (!is_active_split) return;
+
+ // Adjust the pointer for the block_table for this batch.
+ // If seq_starts is provided, compute an offset based on it
+ const int32_t batch_offset = bid * block_table_stride;
+ int32_t offset = split_start;
+ if (seq_starts != nullptr) {
+ offset += seq_starts[bid];
+ }
+ int32_t offset_div = offset / block_size;
+ offset = offset % block_size;
+ const int32_t* batch_block_table = block_table + batch_offset;
+
+ // Adjust dst pointer based on the cumulative sequence lengths.
+ dst += seq_start * dst_entry_stride;
+
+ auto copy_entry = [&](const scalar_t* __restrict__ _src,
+ scalar_t* __restrict__ _dst) {
+ for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
+ _dst[i] = _src[i];
+ };
+
+ for (int pid = split_start; pid < split_end; ++pid) {
+ auto block_id = batch_block_table[offset_div];
+ auto block_start_ptr = src_cache + block_id * cache_block_stride;
+ auto block_dst_ptr = dst + pid * dst_entry_stride;
+ copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
+ offset += 1;
+ // bump to next block
+ if (offset == block_size) {
+ offset_div += 1;
+ offset = 0;
+ }
+ }
+}
+} // namespace vllm
+
+// Macro to dispatch the kernel based on the data type.
+#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \
+ vllm::cp_gather_cache<<>>( \
+ reinterpret_cast(src_cache.data_ptr()), \
+ reinterpret_cast(dst.data_ptr()), \
+ block_table.data_ptr(), cu_seq_lens.data_ptr(), \
+ block_size, entry_size, block_table_stride, cache_block_stride, \
+ cache_entry_stride, dst_entry_stride, seq_starts_ptr);
+
+// Gather sequences from the cache into the destination tensor.
+// - cu_seq_lens contains the cumulative sequence lengths for each batch
+// - block_table contains the cache block indices for each sequence
+// - Optionally, seq_starts (if provided) offsets the starting slot index by
+// seq_starts[bid]
+void cp_gather_cache(
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ int64_t batch_size,
+ std::optional seq_starts = std::nullopt) {
+ at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ int32_t block_size = src_cache.size(1);
+ int32_t entry_size = src_cache.flatten(2, -1).size(2);
+
+ TORCH_CHECK(block_table.dtype() == torch::kInt32,
+ "block_table must be int32");
+ TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
+ "cu_seq_lens must be int32");
+ if (seq_starts.has_value()) {
+ TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
+ "seq_starts must be int32");
+ }
+
+ TORCH_CHECK(src_cache.device() == dst.device(),
+ "src_cache and dst must be on the same device");
+ TORCH_CHECK(src_cache.device() == block_table.device(),
+ "src_cache and block_table must be on the same device");
+ TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
+ "src_cache and cu_seq_lens must be on the same device");
+ if (seq_starts.has_value()) {
+ TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
+ "src_cache and seq_starts must be on the same device");
+ }
+
+ int64_t block_table_stride = block_table.stride(0);
+ int64_t cache_block_stride = src_cache.stride(0);
+ int64_t cache_entry_stride = src_cache.stride(1);
+ int64_t dst_entry_stride = dst.stride(0);
+
+ // Decide on the number of splits based on the batch size.
+ int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+ dim3 grid(batch_size, num_splits);
+ dim3 block(1024);
+
+ TORCH_CHECK(src_cache.dtype() == dst.dtype(),
+ "src_cache and dst must have the same dtype");
+
+ const int dtype_bits = src_cache.element_size() * 8;
+ const int32_t* seq_starts_ptr =
+ seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr;
+
+ if (dtype_bits == 32) {
+ CALL_CP_GATHER_CACHE(uint32_t);
+ } else if (dtype_bits == 16) {
+ CALL_CP_GATHER_CACHE(uint16_t);
+ } else if (dtype_bits == 8) {
+ CALL_CP_GATHER_CACHE(uint8_t);
+ } else {
+ TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
+ }
+}
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index ab8cbbbf4..51bca37e6 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -12,7 +12,7 @@ namespace vec_op {
#define vec_sub(a, b) ((a) - (b))
#define vec_mul(a, b) ((a) * (b))
#define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
// FIXME: FP16 is not fully supported in Torch-CPU
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
index f3f00edb3..6def0e061 100644
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) {
delete ptr;
}
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {  // start with no buffer; realloc() below creates the first one
+ this->realloc(allocation_unit * 128);  // reserve 128 allocation units up front
+}
+
+void DNNLScratchPadManager::realloc(size_t new_size) {  // Grow-only: make the scratchpad at least new_size bytes; callers must re-fetch get_data() afterwards.
+  new_size = round(new_size);  // round up to a whole number of allocation units
+  if (new_size <= size_) return;  // current buffer is already large enough; never shrink
+  std::free(ptr_);  // release the old buffer instead of leaking it (free(nullptr) is a no-op); scratch contents need not be preserved
+  ptr_ = std::aligned_alloc(64, new_size);  // 64-byte aligned; new_size is a multiple of 64 because allocation_unit is
+  size_ = new_size;
+}
+
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {  // Return the single shared manager instance.
+ static DNNLScratchPadManager manager;  // function-local static: constructed once, on the first call
+ return &manager;
+}
+
template
class DNNLPrimitiveCache {
public:
@@ -166,6 +183,23 @@ struct hash {
hash()(static_cast(val.bias_type));
}
};
+
+template <>
+struct hash {  // hash functor for MatMulPrimitiveHandler::ClassMatmulCacheKey (declared inside namespace std)
+ size_t operator()(
+ const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
+ return hash()(val.b_n_size) ^ hash()(val.b_k_size);  // XOR of the two field hashes; XOR is symmetric, so keys with n and k swapped hash alike
+ }
+};
+
+template <>
+struct hash {  // hash functor for MatMulPrimitiveHandler::MSizeCacheKey (declared inside namespace std)
+ size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
+ return hash()(val.a_m_size) ^  // XOR-combine the hashes of all four key fields,
+ hash()(val.a_m_stride) ^ hash()(val.use_bias) ^  // the same four fields that operator== compares
+ hash()(static_cast(val.bias_type));  // bias_type is cast before being hashed
+ }
+};
} // namespace std
bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
@@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
l.bias_type == r.bias_type;
}
+bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,  // keys are equal when b_n_size and b_k_size both match
+ const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
+ return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+}
+
+bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,  // keys are equal when all four fields match
+ const MatMulPrimitiveHandler::MSizeCacheKey& r) {
+ return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride &&
+ l.use_bias == r.use_bias && l.bias_type == r.bias_type;
+}
+
static std::shared_ptr
get_w8a8_class_primitive_cache(
const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
@@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
}
dnnl::matmul matmul = get_matmul_cache(args);
+
+ auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
+ scratchpad_storage->set_data_handle(
+ DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data());
+
matmul.execute(default_stream(), memory_cache_);
default_stream().wait();
}
@@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
return m_size_cache_->get_or_create(key, [&]() {
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+ auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+ manager->realloc(desc.scratchpad_desc().get_size());
return dnnl::matmul(desc);
});
}
@@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr);
set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
+
+ memory_cache_[DNNL_ARG_SCRATCHPAD] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
}
dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
@@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
dnnl::memory::format_tag::ab);
dnnl::primitive_attr attr;
+
+ attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
// For PER_TOKEN, scales will be applied in outside epilogue
if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
attr.set_scales_mask(DNNL_ARG_SRC, 0);
@@ -344,3 +404,120 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
attr);
}
}
+
+MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)  // Validate the operand type, prepack B, then set up the runtime memory slots.
+ : DNNLMatMulPrimitiveHandler(
+ static_cast(args), args.ab_type),
+ m_size_cache_(nullptr) {  // the per-M primitive cache is attached lazily in get_matmul_cache()
+ assert(ab_type_ == dnnl::memory::data_type::f32 ||  // only f32 / bf16 / f16 operands are accepted
+ ab_type_ == dnnl::memory::data_type::bf16 ||  // NOTE: assert() is compiled out when NDEBUG is defined
+ ab_type_ == dnnl::memory::data_type::f16);
+ prepack_weight(args.b_ptr,  // hand B plus the weights_desc() of a runtime-M descriptor to prepack_weight
+ create_primitive_desc(
+ MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,  // M and its stride are left as runtime dimensions
+ .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+ .use_bias = false,
+ .bias_type = dnnl::memory::data_type::undef},
+ true)  // first_time = true: B is described with format_tag::any
+ .weights_desc());
+ init_runtime_memory_cache(args);  // create the null-handle dnnl::memory objects that execute() fills in
+}
+
+static std::shared_ptr
+get_matul_class_primitive_cache(  // NOTE(review): "matul" looks like a typo for "matmul"; a rename must also update the call in get_matmul_cache()
+ const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
+ int64_t cache_size) {
+ static MatMulPrimitiveHandler::ClassMatmulCache cache(128);  // function-local static: one cache instance for every call; 128 is presumably its capacity (TODO confirm)
+ assert(cache_size > 0);  // debug-only check; compiled out when NDEBUG is defined
+ return cache.get_or_create(key, [&]() {  // the lambda builds the per-M cache for this key (presumably only invoked on a miss)
+ return std::make_shared(cache_size);
+ });
+}
+
+void MatMulPrimitiveHandler::execute(ExecArgs& args) {  // Bind the caller's buffers to the runtime memory slots and run the matmul for this key.
+ auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);  // slot 0 = DNNL_ARG_SRC (registered in init_runtime_memory_cache)
+ auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);  // slot 1 = DNNL_ARG_DST
+ a_storage->set_data_handle((void*)args.a_ptr);  // point the placeholder memory at the caller's A buffer
+ a_mem_desc->dims[0] = args.a_m_size;  // patch the descriptor in place with this call's M ...
+ a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;  // ... and its row stride
+ c_storage->set_data_handle((void*)args.c_ptr);
+ c_mem_desc->dims[0] = args.a_m_size;  // C gets the same row count as A
+
+ if (args.use_bias) {
+ auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);  // slot 2 = DNNL_ARG_BIAS
+ bias_storage->set_data_handle((void*)args.bias_ptr);
+ }
+
+ dnnl::matmul matmul = get_matmul_cache(args);  // ExecArgs derives from MSizeCacheKey; this call may grow the shared scratchpad
+
+ auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);  // slot 3 = DNNL_ARG_SCRATCHPAD
+ scratchpad_storage->set_data_handle(  // fetched after get_matmul_cache() so the pointer reflects any reallocation
+ DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data());
+
+ matmul.execute(default_stream(), memory_cache_);
+ default_stream().wait();  // wait on the stream before returning to the caller
+}
+
+dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(  // Look up (or build through get_or_create) the matmul primitive for key.
+ const MSizeCacheKey& key) {
+ if (m_size_cache_.get() == nullptr) {  // first call on this handler: attach the cache selected by (b_n_size_, b_k_size_)
+ ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};  // NOTE: shadows the parameter `key`, but only inside this block
+ m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
+ }
+ return m_size_cache_->get_or_create(key, [&]() {  // here `key` is the MSizeCacheKey parameter again
+ dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+ auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+ manager->realloc(desc.scratchpad_desc().get_size());  // make the shared scratchpad at least as large as this primitive requires
+ return dnnl::matmul(desc);
+ });
+}
+
+dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(  // Build the matmul primitive descriptor for key, with or without bias.
+ const MSizeCacheKey& key, bool first_time) {
+ dnnl::memory::desc a_md;
+ dnnl::memory::desc b_md;
+ if (first_time) {  // path used by the constructor to obtain a weights layout for prepacking
+ a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,  // A is described with b_type_: A and B share one data type here
+ dnnl::memory::format_tag::ab);
+ b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
+ dnnl::memory::format_tag::any);  // `any` leaves the choice of weights layout to oneDNN
+ } else {
+ a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+ {key.a_m_stride, 1});  // explicit strides: a_m_stride between rows, 1 within a row
+ b_md = b_target_mem_desc_;  // NOTE(review): presumably the layout fixed during prepacking; confirm in the base class
+ }
+ dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
+ dnnl::memory::format_tag::ab);
+
+ dnnl::primitive_attr attr;
+ attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);  // scratchpad is supplied by the caller at execute time
+
+ if (key.use_bias) {
+ dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});  // bias is described as a 1 x N row
+ return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
+ c_md, attr);
+ } else {
+ return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
+ attr);
+ }
+}
+
+void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {  // Create null-handle memories for SRC/DST/BIAS/SCRATCHPAD and register them as slots 0-3; `args` is not read here.
+ memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
+ {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);  // M = 1 placeholder; execute() patches dims[0] and the row stride
+ set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
+ memory_cache_[DNNL_ARG_DST] =
+ dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},  // M = 1 placeholder; execute() patches dims[0]
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
+
+ memory_cache_[DNNL_ARG_BIAS] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},  // NOTE(review): desc is fixed to f32 although the key's bias_type may differ; confirm this is harmless
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
+
+ memory_cache_[DNNL_ARG_SCRATCHPAD] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},  // placeholder desc only; execute() sets the handle from DNNLScratchPadManager
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
+}
diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h
index 54ceefced..ad6773d2b 100644
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@@ -59,6 +59,30 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
return DNNLType>::type;
}
+class DNNLScratchPadManager {  // Holds one grow-only scratchpad buffer; no destructor is declared, so the buffer is never released by this class.
+ public:
+ static constexpr size_t allocation_unit = 4 * 1024 * 1024; // 4 MiB (not 4 KB). NOTE(review): if 4 KB was intended, drop one "* 1024"
+
+ static DNNLScratchPadManager* get_dnnl_scratchpad_manager();  // accessor for the shared instance
+
+ DNNLScratchPadManager();
+
+ template
+ T* get_data() {  // current buffer pointer, reinterpreted to the requested pointer type
+ return reinterpret_cast(ptr_);
+ }
+
+ static size_t round(size_t size) {  // round size up to the next multiple of allocation_unit
+ return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+ }
+
+ void realloc(size_t new_size);  // ensure the buffer holds at least new_size bytes
+
+ private:
+ size_t size_;  // current buffer capacity in bytes
+ void* ptr_;  // current buffer, or nullptr before the first allocation
+};
+
class DNNLMatMulPrimitiveHandler {
public:
virtual ~DNNLMatMulPrimitiveHandler() = default;
@@ -166,4 +190,54 @@ class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
std::shared_ptr m_size_cache_;
};
+class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {  // oneDNN matmul handler with optional bias and no scale/zero-point arguments
+ public:
+ struct Args : public DNNLMatMulPrimitiveHandler::Args {  // construction arguments: the base Args plus the operand type
+ dnnl::memory::data_type ab_type;  // data type passed to the base-class constructor for A and B
+ };
+
+ struct ClassMatmulCacheKey {  // first-level cache key: the shape of B
+ dnnl_dim_t b_n_size;
+ dnnl_dim_t b_k_size;
+
+ friend bool operator==(const ClassMatmulCacheKey& l,
+ const ClassMatmulCacheKey& r);
+ };
+
+ struct MSizeCacheKey {  // second-level cache key: per-call properties of A and the bias
+ dnnl_dim_t a_m_size;  // number of rows of A
+ dnnl_dim_t a_m_stride;  // stride between consecutive rows of A
+ bool use_bias;
+ dnnl::memory::data_type bias_type;
+
+ friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
+ };
+
+ using MSizeCache = DNNLPrimitiveCache;
+ using ClassMatmulCache =
+ DNNLPrimitiveCache>;
+
+ struct ExecArgs : public MSizeCacheKey {  // per-call arguments; usable directly as an MSizeCacheKey
+ const void* a_ptr;  // input matrix A
+ const void* bias_ptr;  // bias buffer; only read by execute() when use_bias is true
+ void* c_ptr;  // output matrix C
+ };
+
+ public:
+ MatMulPrimitiveHandler(const Args& args);
+
+ void execute(ExecArgs& args);  // run one matmul with the buffers in args
+
+ private:
+ dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
+ bool first_time);  // first_time selects the descriptor form used for weight prepacking
+
+ void init_runtime_memory_cache(const Args& args);
+
+ dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
+
+ private:
+ std::shared_ptr m_size_cache_;  // starts as nullptr; assigned on the first get_matmul_cache() call
+};
+
#endif
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index acc3b9ecd..9a3af4ac9 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
}
}
- float scale_val, azp_val;
+ float scale_val;
+ float azp_val = 0.0f;
if constexpr (AZP) {
float max_scalar = max_value.reduce_max();
float min_scalar = min_value.reduce_min();
@@ -379,6 +380,7 @@ void onednn_scaled_mm(
exec_args.a_ptr = a.data_ptr();
exec_args.a_m_size = a.size(0);
exec_args.bias_ptr = nullptr;
+ exec_args.bias_type = get_dnnl_type();
exec_args.use_bias = false;
exec_args.a_scales_ptr = nullptr;
exec_args.a_zero_points_ptr = nullptr;
@@ -492,3 +494,56 @@ void dynamic_scaled_int8_quant(
}
});
}
+
+int64_t create_onednn_mm_handler(const torch::Tensor& b,  // Build a MatMulPrimitiveHandler for weight matrix b and return its pointer as an integer handle.
+ int64_t primitive_cache_size) {
+ TORCH_CHECK(b.dim() == 2);  // b must be a 2-D matrix
+
+ MatMulPrimitiveHandler::Args args;
+ args.primitive_cache_size = primitive_cache_size;
+
+ args.b_k_size = b.size(0);  // b is read as [K, N]: dim 0 is K, dim 1 is N
+ args.b_k_stride = b.stride(0);
+ args.b_n_size = b.size(1);
+ args.b_n_stride = b.stride(1);
+ args.b_ptr = b.data_ptr();
+
+ VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler",  // dispatch on b's dtype
+ [&] {
+ args.c_type = get_dnnl_type();  // presumably the oneDNN type of the dispatched scalar type (TODO confirm)
+ args.ab_type = get_dnnl_type();
+ });
+
+ return reinterpret_cast(new MatMulPrimitiveHandler(args));  // heap-allocated; the caller owns the handle (presumably freed via release_dnnl_matmul_handler; confirm)
+}
+
+void onednn_mm(torch::Tensor& c, // [M, OC], row-major
+ const torch::Tensor& a, // [M, IC], row-major
+ const std::optional& bias, int64_t handler) {  // handler: integer handle that is cast back to MatMulPrimitiveHandler* below
+ CPU_KERNEL_GUARD_IN(onednn_mm)
+ TORCH_CHECK(a.dim() == 2);
+ TORCH_CHECK(a.stride(-1) == 1);  // rows of a may be strided, but elements within a row must be contiguous
+ TORCH_CHECK(c.is_contiguous());  // NOTE(review): no check here that the shapes/dtypes of a, c, bias match the handler
+ MatMulPrimitiveHandler* ptr =
+ reinterpret_cast(handler);  // the handle is trusted; nothing here validates it
+
+ MatMulPrimitiveHandler::ExecArgs exec_args;
+ exec_args.a_m_size = a.size(0);
+ exec_args.a_m_stride = a.stride(0);  // passed through so non-contiguous rows are supported
+
+ VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {  // dispatch on a's dtype
+ if (bias.has_value()) {
+ exec_args.use_bias = true;
+ exec_args.bias_type = get_dnnl_type();
+ exec_args.bias_ptr = bias->data_ptr();
+ } else {
+ exec_args.use_bias = false;
+ exec_args.bias_type = get_dnnl_type();  // still assigned: bias_type is part of the cache key that gets hashed and compared
+ exec_args.bias_ptr = nullptr;
+ }
+ exec_args.a_ptr = a.data_ptr();
+ exec_args.c_ptr = c.data_ptr();
+
+ ptr->execute(exec_args);
+ });
+}
diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp
index beeccff78..94b24c2f1 100644
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -215,7 +215,7 @@ int moe_align_block_size(
offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
}
});
- // TODO: do we need to vecterize this ?
+ // TODO: do we need to vectorize this ?
for (int mb = 0; mb < num_token_blocks; ++mb) {
offsets[mb + 1] += offsets[mb];
}
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index c9f426bdf..98c3ebc5a 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -21,6 +21,12 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional& bias,
int64_t handler);
+int64_t create_onednn_mm_handler(const torch::Tensor& b,
+ int64_t primitive_cache_size);
+
+void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
+ const std::optional& bias, int64_t handler);
+
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -153,6 +159,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("release_dnnl_matmul_handler(int handler) -> ()",
&release_dnnl_matmul_handler);
+ // Create oneDNN GEMM handler
+ ops.def(
+ "create_onednn_mm_handler(Tensor b, int "
+ "primitive_cache_size) -> int",
+ &create_onednn_mm_handler);
+
+ // oneDNN GEMM
+ ops.def(
+ "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
+ "int handler) -> ()");
+ ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
+
// Create oneDNN W8A8 handler
ops.def(
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh
index 44709b459..58926f642 100644
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16;
#include