diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 68aff793a..76f6d7aec 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -5,11 +5,11 @@ import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB -# Note that we have 400 MiB quota, please use it wisely. -# See https://github.com/pypi/support/issues/3792 . +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB +# Note that we have 800 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/6326 . # Please also sync the value with the one in Dockerfile. -VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) +VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 50431d0cd..5ea5a50a2 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -218,7 +218,7 @@ if __name__ == "__main__": "--xaxis", type=str, default="# of max concurrency.", - help="column name to use as X Axis in comparision graph", + help="column name to use as X Axis in comparison graph", ) args = parser.parse_args() diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index 2d88a0b30..f758097e0 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -1,6 +1,6 @@ [ { - "test_name": "serving_llama8B_tp1_sharegpt", + "test_name": "serving_llama8B_bf16_tp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -32,7 +32,7 @@ } }, { - "test_name": "serving_llama8B_tp2_sharegpt", + "test_name": "serving_llama8B_bf16_tp2_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -64,7 +64,7 @@ } }, { - "test_name": "serving_llama8B_tp4_sharegpt", + "test_name": "serving_llama8B_bf16_tp4_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -96,7 +96,7 @@ } }, { - "test_name": "serving_llama8B_tp1_random_128_128", + "test_name": "serving_llama8B_bf16_tp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -131,7 +131,7 @@ } }, { - "test_name": "serving_llama8B_tp2_random_128_128", + "test_name": "serving_llama8B_bf16_tp2_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -166,7 +166,7 @@ } }, { - "test_name": "serving_llama8B_tp4_random_128_128", + "test_name": "serving_llama8B_bf16_tp4_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -198,5 +198,413 @@ "random-output-len": 128, "num_prompts": 1000 } + }, + { + "test_name": "serving_llama8B_int8_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } } ] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index 823abbaa9..ce396d6e5 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -1,6 +1,6 @@ [ { - "test_name": "serving_llama8B_pp1_sharegpt", + "test_name": "serving_llama8B_bf16_pp1_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -32,7 +32,39 @@ } }, { - "test_name": "serving_llama8B_pp3_sharegpt", + "test_name": "serving_llama8B_bf16_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_pp3_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -64,7 +96,7 @@ } }, { - "test_name": "serving_llama8B_tp2pp3_sharegpt", + "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -97,7 +129,7 @@ } }, { - "test_name": "serving_llama8B_pp1_random_128_128", + "test_name": "serving_llama8B_bf16_pp1_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -132,7 +164,42 @@ } }, { - "test_name": "serving_llama8B_pp3_random_128_128", + "test_name": "serving_llama8B_bf16_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_pp3_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -167,7 +234,7 @@ } }, { - "test_name": "serving_llama8B_tp2pp3_random_128_128", + "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", "qps_list": ["inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { @@ -201,5 +268,553 @@ "ignore-eos": "", "num_prompts": 1000 } + }, + { + "test_name": "serving_llama8B_int8_pp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_pp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } } ] diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index f96c38bf5..a1de41652 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,21 +1,24 @@ steps: - # aarch64 + CUDA builds - - label: "Build arm64 wheel - CUDA 12.8" - id: build-wheel-arm64-cuda-12-8 + # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build arm64 wheel - CUDA 12.9" + id: build-wheel-arm64-cuda-12-9 agents: queue: arm64_cpu_queue_postmerge commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - # x86 + CUDA builds + - block: "Build CUDA 12.8 wheel" + key: block-build-cu128-wheel + - label: "Build wheel - CUDA 12.8" + depends_on: block-build-cu128-wheel id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge @@ -44,44 +47,63 @@ steps: env: DOCKER_BUILDKIT: "1" - # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. - # However, this block can be uncommented to save some compute hours. - # - block: "Build CUDA 11.8 wheel" - # key: block-build-cu118-wheel - - - label: "Build wheel - CUDA 11.8" - # depends_on: block-build-cu118-wheel - id: build-wheel-cuda-11-8 + # x86 + CUDA builds + - label: "Build wheel - CUDA 12.9" + depends_on: ~ + id: build-wheel-cuda-12-9 agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build release image" + - label: "Build release image (x86)" depends_on: ~ - key: block-release-image-build - - - label: "Build release image" - depends_on: block-release-image-build - id: build-release-image + id: build-release-image-x86 agents: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + # re-tag to default image tag and push, just in case arm64 build fails + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build release image (arm64)" + depends_on: ~ + id: build-release-image-arm64 + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + + # Add job to create multi-arch manifest + - label: "Create multi-arch manifest" + depends_on: + - build-release-image-x86 + - build-release-image-arm64 + id: create-multi-arch-manifest + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Annotate release workflow" depends_on: - - build-release-image + - create-multi-arch-manifest - build-wheel-cuda-12-8 - build-wheel-cuda-12-6 - - build-wheel-cuda-11-8 + - build-wheel-cuda-12-9 id: annotate-release-workflow agents: queue: cpu_queue_postmerge @@ -128,18 +150,24 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build Neuron release image" - key: block-neuron-release-image-build - depends_on: ~ - - - label: "Build and publish Neuron release image" - depends_on: block-neuron-release-image-build + - label: "Build and publish nightly multi-arch image to DockerHub" + depends_on: + - create-multi-arch-manifest + if: build.env("NIGHTLY") == "1" agents: - queue: neuron-postmerge + queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" + - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" + - "docker push vllm/vllm-openai:nightly" + - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" + # Clean up old nightly builds (keep only last 14) + - "bash .buildkite/scripts/cleanup-nightly-builds.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh new file mode 100755 index 000000000..1a82f7d08 --- /dev/null +++ b/.buildkite/scripts/cleanup-nightly-builds.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +set -ex + +# Clean up old nightly builds from DockerHub, keeping only the last 14 builds +# This script uses DockerHub API to list and delete old tags with "nightly-" prefix + +# DockerHub API endpoint for vllm/vllm-openai repository +REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" + +# Get DockerHub token from environment +if [ -z "$DOCKERHUB_TOKEN" ]; then + echo "Error: DOCKERHUB_TOKEN environment variable is not set" + exit 1 +fi + +# Function to get all tags from DockerHub +get_all_tags() { + local page=1 + local all_tags="" + + while true; do + local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \ + "$REPO_API_URL?page=$page&page_size=100") + + # Get both last_updated timestamp and tag name, separated by | + local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') + + if [ -z "$tags" ]; then + break + fi + + all_tags="$all_tags$tags"$'\n' + page=$((page + 1)) + done + + # Sort by timestamp (newest first) and extract just the tag names + echo "$all_tags" | sort -r | cut -d'|' -f2 +} + +delete_tag() { + local tag_name="$1" + echo "Deleting tag: $tag_name" + + local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" + local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url") + + if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then + echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" + else + echo "Successfully deleted tag: $tag_name" + fi +} + +# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first) +echo "Fetching all tags from DockerHub..." +all_tags=$(get_all_tags) + +if [ -z "$all_tags" ]; then + echo "No tags found to clean up" + exit 0 +fi + +# Count total tags +total_tags=$(echo "$all_tags" | wc -l) +echo "Found $total_tags tags" + +# Keep only the last 14 builds (including the current one) +tags_to_keep=14 +tags_to_delete=$((total_tags - tags_to_keep)) + +if [ $tags_to_delete -le 0 ]; then + echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)" + exit 0 +fi + +echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep" + +# Get tags to delete (skip the first $tags_to_keep tags) +tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1))) + +if [ -z "$tags_to_delete_list" ]; then + echo "No tags to delete" + exit 0 +fi + +# Delete old tags +echo "Deleting old tags..." +while IFS= read -r tag; do + if [ -n "$tag" ]; then + delete_tag "$tag" + # Add a small delay to avoid rate limiting + sleep 1 + fi +done <<< "$tags_to_delete_list" + +echo "Cleanup completed successfully" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index df0bae0c9..c395011a2 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_chat.py \ --ignore=entrypoints/llm/test_accuracy.py \ --ignore=entrypoints/llm/test_init.py \ - --ignore=entrypoints/llm/test_generate_multiple_loras.py \ --ignore=entrypoints/llm/test_prompt_validation.py "} fi diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 9dec9f8e9..0f734763f 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -49,23 +49,23 @@ function cpu_tests() { # Run kernel tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -v -s tests/kernels/test_onednn.py" + pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e # Note: disable until supports V1 - # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model - # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model + # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model # Note: disable Bart until supports V1 - pytest -v -s tests/models/language/generation -m cpu_model \ + pytest -x -v -s tests/models/language/generation -m cpu_model \ --ignore=tests/models/language/generation/test_bart.py - VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ + VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \ --ignore=tests/models/language/generation/test_bart.py - pytest -v -s tests/models/language/pooling -m cpu_model - pytest -v -s tests/models/multimodal/generation \ + pytest -x -v -s tests/models/language/pooling -m cpu_model + pytest -x -v -s tests/models/multimodal/generation \ --ignore=tests/models/multimodal/generation/test_mllama.py \ --ignore=tests/models/multimodal/generation/test_pixtral.py \ -m cpu_model" @@ -73,33 +73,49 @@ function cpu_tests() { # Run compressed-tensor test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + pytest -x -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" # Note: disable it until supports V1 # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e - # VLLM_USE_V1=0 pytest -s -v \ + # VLLM_USE_V1=0 pytest -x -s -v \ # tests/quantization/test_ipex_quant.py" # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + pytest -x -s -v \ tests/lora/test_qwen2vl.py" - # online serving + # online serving: tp+pp docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + server_pid=$! timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ --num-prompts 20 \ - --endpoint /v1/completions' + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' + + # online serving: tp+dp + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh deleted file mode 100644 index a397457c8..000000000 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# This script build the Neuron docker image and run the API server inside the container. -# It serves a sanity check for compilation and basic model usage. -set -e -set -v - -image_name="neuron/vllm-ci" -container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" - -HF_CACHE="$(realpath ~)/huggingface" -mkdir -p "${HF_CACHE}" -HF_MOUNT="/root/.cache/huggingface" -HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) - -NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" -mkdir -p "${NEURON_COMPILE_CACHE_URL}" -NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" - -# Try building the docker image -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws - -# prune old image and containers to save disk space, and only once a day -# by using a timestamp file in tmp. -if [ -f /tmp/neuron-docker-build-timestamp ]; then - last_build=$(cat /tmp/neuron-docker-build-timestamp) - current_time=$(date +%s) - if [ $((current_time - last_build)) -gt 86400 ]; then - # Remove dangling images (those that are not tagged and not used by any container) - docker image prune -f - # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune -f - echo "$current_time" > /tmp/neuron-docker-build-timestamp - fi -else - date "+%s" > /tmp/neuron-docker-build-timestamp -fi - -docker build -t "${image_name}" -f docker/Dockerfile.neuron . - -# Setup cleanup -remove_docker_container() { - docker image rm -f "${image_name}" || true; -} -trap remove_docker_container EXIT - -# Run the image -docker run --rm -it --device=/dev/neuron0 --network bridge \ - -v "${HF_CACHE}:${HF_MOUNT}" \ - -e "HF_HOME=${HF_MOUNT}" \ - -e "HF_TOKEN=${HF_TOKEN}" \ - -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ - -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ - --name "${container_name}" \ - ${image_name} \ - /bin/bash -c " - set -e; # Exit on first error - python3 /workspace/vllm/examples/offline_inference/neuron.py; - python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; - for f in /workspace/vllm/tests/neuron/2_core/*.py; do - echo \"Running test file: \$f\"; - python3 -m pytest \$f -v --capture=tee-sys; - done - " \ No newline at end of file diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 73f3e63fb..8c9b00990 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -30,10 +30,12 @@ docker run \ bash -c ' set -e echo $ZE_AFFINITY_MASK - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + pip install tblib==3.1.0 + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager cd tests pytest -v -s v1/core pytest -v -s v1/engine diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 745f285c0..43aa8c47b 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel" aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -if [[ $normal_wheel == *"cu118"* ]]; then - # if $normal_wheel matches cu118, do not upload the index.html - echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu126"* ]]; then +if [[ $normal_wheel == *"cu126"* ]]; then # if $normal_wheel matches cu126, do not upload the index.html echo "Skipping index files for cu126 wheels" +elif [[ $normal_wheel == *"cu128"* ]]; then + # if $normal_wheel matches cu128, do not upload the index.html + echo "Skipping index files for cu128 wheels" else - # only upload index.html for cu128 wheels (default wheels) + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -74,14 +75,15 @@ fi aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -if [[ $normal_wheel == *"cu118"* ]]; then - # if $normal_wheel matches cu118, do not upload the index.html - echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu126"* ]]; then +if [[ $normal_wheel == *"cu126"* ]]; then # if $normal_wheel matches cu126, do not upload the index.html echo "Skipping index files for cu126 wheels" +elif [[ $normal_wheel == *"cu128"* ]]; then + # if $normal_wheel matches cu128, do not upload the index.html + echo "Skipping index files for cu128 wheels" else - # only upload index.html for cu128 wheels (default wheels) + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0d3b7a294..b0f5fe418 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -41,7 +41,8 @@ steps: commands: - bash standalone_tests/pytorch_nightly_dependency.sh -- label: Async Engine, Inputs, Utils, Worker Test # 24min +- label: Async Engine, Inputs, Utils, Worker Test # 36min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -53,6 +54,7 @@ steps: - tests/utils_ - tests/worker - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine @@ -62,8 +64,10 @@ steps: - pytest -v -s multimodal - pytest -v -s utils_ # Utils - pytest -v -s worker # Worker + - pytest -v -s transformers_utils # transformers_utils -- label: Python-only Installation Test +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - tests/standalone_tests/python_only_compile.sh @@ -71,7 +75,8 @@ steps: commands: - bash standalone_tests/python_only_compile.sh -- label: Basic Correctness Test # 30min +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] fast_check: true torch_nightly: true @@ -88,7 +93,8 @@ steps: - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py -- label: Core Test # 10min +- label: Core Test # 22min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] fast_check: true source_file_dependencies: @@ -98,7 +104,19 @@ steps: commands: - pytest -v -s core -- label: Entrypoints Test (LLM) # 40min +- label: Entrypoints Unit Tests # 5min + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -109,13 +127,13 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Test (API Server) # 40min +- label: Entrypoints Integration Test (API Server) # 100min + timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -127,10 +145,24 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py -- label: Distributed Tests (4 GPUs) # 10min +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -173,7 +205,8 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd -- label: EPLB Algorithm Test +- label: EPLB Algorithm Test # 5min + timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb @@ -182,6 +215,7 @@ steps: - pytest -v -s distributed/test_eplb_algo.py - label: EPLB Execution Test # 5min + timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -190,13 +224,14 @@ steps: commands: - pytest -v -s distributed/test_eplb_execute.py -- label: Metrics, Tracing Test # 10min +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] num_gpus: 2 source_file_dependencies: - vllm/ - tests/metrics - - tests/tracing + - tests/v1/tracing commands: - pytest -v -s metrics - "pip install \ @@ -209,7 +244,8 @@ steps: ##### fast check tests ##### ##### 1 GPU test ##### -- label: Regression Test # 5min +- label: Regression Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -219,7 +255,8 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 10min +- label: Engine Test # 25min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -234,7 +271,29 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -242,8 +301,6 @@ steps: commands: # split the test to avoid interference - pytest -v -s v1/core - - pytest -v -s v1/engine - - pytest -v -s v1/entrypoints - pytest -v -s v1/executor - pytest -v -s v1/sample - pytest -v -s v1/logits_processors @@ -256,14 +313,12 @@ steps: - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_metrics_reader.py - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Examples Test # 25min +- label: Examples Test # 30min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/examples" source_file_dependencies: @@ -281,14 +336,14 @@ steps: - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 -- label: Platform Tests (CUDA) +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -296,7 +351,8 @@ steps: commands: - pytest -v -s cuda/test_cuda_context.py -- label: Samplers Test # 36min +- label: Samplers Test # 56min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers @@ -307,15 +363,23 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LoRA Test %N # 15min each +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py parallelism: 4 -- label: PyTorch Compilation Unit Tests +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -331,7 +395,8 @@ steps: - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py -- label: PyTorch Fullgraph Smoke Test # 9min +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -339,13 +404,10 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py - # these tests need to be separated, cannot combine - - pytest -v -s compile/piecewise/test_simple.py - - pytest -v -s compile/piecewise/test_toy_llama.py - - pytest -v -s compile/piecewise/test_full_cudagraph.py - - pytest -v -s compile/piecewise/test_multiple_graphs.py + - pytest -v -s compile/piecewise/ -- label: PyTorch Fullgraph Test # 18min +- label: PyTorch Fullgraph Test # 20min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -354,7 +416,8 @@ steps: commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Core Operation Test +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -362,7 +425,8 @@ steps: commands: - pytest -v -s kernels/core -- label: Kernels Attention Test %N +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ @@ -373,7 +437,8 @@ steps: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels Quantization Test %N +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/quantization/ @@ -383,7 +448,8 @@ steps: - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels MoE Test %N +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ @@ -395,7 +461,8 @@ steps: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels Mamba Test +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/mamba/ @@ -403,7 +470,8 @@ steps: commands: - pytest -v -s kernels/mamba -- label: Tensorizer Test # 11min +- label: Tensorizer Test # 14min + timeout_in_minutes: 25 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/model_loader @@ -415,7 +483,8 @@ steps: - pytest -v -s tensorizer_loader - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py -- label: Model Executor Test +- label: Model Executor Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor @@ -425,7 +494,8 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor -- label: Benchmarks # 9min +- label: Benchmarks # 11min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: @@ -433,7 +503,8 @@ steps: commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 10min +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -441,7 +512,8 @@ steps: commands: - pytest -v -s benchmarks/ -- label: Quantization Test +- label: Quantization Test # 70min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -449,11 +521,16 @@ steps: - tests/quantization commands: # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release - - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -461,7 +538,8 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 -- label: OpenAI API correctness +- label: OpenAI API correctness # 22min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -470,7 +548,8 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: Encoder Decoder tests # 5min +- label: Encoder Decoder tests # 12min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -478,7 +557,8 @@ steps: commands: - pytest -v -s encoder_decoder -- label: OpenAI-Compatible Tool Use # 20 min +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] fast_check: false source_file_dependencies: @@ -491,7 +571,8 @@ steps: ##### models test ##### -- label: Basic Models Test # 24min +- label: Basic Models Test # 57min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -504,7 +585,8 @@ steps: - pytest -v -s models/test_vision.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) +- label: Language Models Test (Standard) # 35min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -515,6 +597,7 @@ steps: - pytest -v -s models/language -m core_model - label: Language Models Test (Hybrid) # 35 min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -527,7 +610,8 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m hybrid_model -- label: Language Models Test (Extended Generation) # 1hr20min +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: @@ -538,7 +622,18 @@ steps: - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + - label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: @@ -547,16 +642,27 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' -- label: Multi-Modal Processor Test +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 source_file_dependencies: - vllm/ - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - - pytest -v -s models/multimodal/processing/test_tensor_schema.py + - pytest -v -s models/multimodal/processing -- label: Multi-Modal Models Test (Standard) +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 80 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -566,7 +672,7 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] @@ -598,7 +704,8 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -- label: Quantized Models Test +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers/quantization @@ -628,7 +735,8 @@ steps: - python3 examples/offline_inference/audio_language.py --model-type whisper - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl -- label: Blackwell Test +- label: Blackwell Test # 38 min + timeout_in_minutes: 60 working_dir: "/vllm-workspace/" gpu: b200 # optional: true @@ -650,10 +758,12 @@ steps: # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -663,11 +773,13 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py ##### 1 GPU test ##### ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -679,6 +791,7 @@ steps: - pytest -v -s distributed/test_shm_broadcast.py - label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -702,7 +815,8 @@ steps: - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code -- label: Distributed Tests (2 GPUs) # 40min +- label: Distributed Tests (2 GPUs) # 110min + timeout_in_minutes: 150 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -733,7 +847,8 @@ steps: # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' # test sequence parallel - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. @@ -743,6 +858,7 @@ steps: - pytest -v -s models/multimodal/generation/test_maverick.py - label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -755,6 +871,11 @@ steps: - pytest -v -s plugins_tests/test_platform_plugins.py - pip uninstall vllm_add_dummy_platform -y # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test # other tests continue here: - pytest -v -s plugins_tests/test_scheduler_plugins.py - pip install -e ./plugins/vllm_add_dummy_model @@ -763,7 +884,8 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Pipeline Parallelism Test # 45min +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -776,8 +898,10 @@ steps: commands: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py + # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support -- label: LoRA TP Test (Distributed) +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] num_gpus: 4 source_file_dependencies: @@ -791,13 +915,15 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_multi_loras_with_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py - label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 + optional: true source_file_dependencies: - vllm/ - tests/weight_loading diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml new file mode 100644 index 000000000..443dfa45a --- /dev/null +++ b/.github/.bc-linter.yml @@ -0,0 +1,24 @@ +# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md +version: 1 +paths: +# We temporarily disable globally, and will only enable with `annotations.include` +# include: +# - "vllm/v1/attetion/*.py" +# - "vllm/v1/core/*.py" +exclude: + - "**/*.py" + +scan: + functions: true # check free functions and methods + classes: true # check classes/dataclasses + public_only: true # ignore names starting with "_" at any level + +annotations: + include: # decorators that force‑include a symbol + - name: "bc_linter_include" # matched by simple name or dotted suffix + propagate_to_members: false # for classes, include methods/inner classes + exclude: # decorators that force‑exclude a symbol + - name: "bc_linter_skip" # matched by simple name or dotted suffix + propagate_to_members: true # for classes, exclude methods/inner classes + +excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c087fd555..846b68054 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,18 +5,21 @@ /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/mamba @tdoublep -/vllm/multimodal @DarkLight1337 @ywang96 +/vllm/model_executor/model_loader @22quinn +/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche +/vllm/v1/sample @22quinn @houseroad /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee -/vllm/reasoning @aarnphm -/vllm/entrypoints @aarnphm +/vllm/reasoning @aarnphm @chaunceyjiang +/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg +/vllm/distributed/kv_transfer @NickLucche CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, @@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb @aarnphm +/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett +/vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/attention/backends/triton_attn.py @tdoublep +/vllm/v1/core @heheda12345 +/vllm/v1/kv_cache_interface.py @heheda12345 # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo @@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 -/tests/multimodal @DarkLight1337 @ywang96 +/tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm +/tests/v1/core @heheda12345 /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep +/tests/v1/kv_connector/nixl_integration @NickLucche # Docs /docs @hmellor @@ -67,6 +75,9 @@ mkdocs.yaml @hmellor /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow /vllm/model_executor/models/qwen* @sighingnow +# MTP-specific files +/vllm/model_executor/models/deepseek_mtp.py @luccafong + # Mistral-specific files /vllm/model_executor/models/mistral*.py @patrickvonplaten /vllm/model_executor/models/mixtral*.py @patrickvonplaten @@ -86,3 +97,8 @@ mkdocs.yaml @hmellor /vllm/attention/ops/rocm*.py @gshtras /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras +# TPU +/vllm/v1/worker/tpu* @NickLucche +/vllm/platforms/tpu.py @NickLucche +/vllm/v1/sample/tpu @NickLucche +/vllm/tests/v1/tpu @NickLucche \ No newline at end of file diff --git a/.github/mergify.yml b/.github/mergify.yml index 495d207d4..f2dd2e062 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -124,9 +124,16 @@ pull_request_rules: - or: - files~=^examples/.*gpt[-_]?oss.*\.py - files~=^tests/.*gpt[-_]?oss.*\.py + - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py + - files~=^tests/entrypoints/test_context.py - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py + - files~=^vllm/entrypoints/harmony_utils.py + - files~=^vllm/entrypoints/tool_server.py + - files~=^vllm/entrypoints/tool.py + - files~=^vllm/entrypoints/context.py - title~=(?i)gpt[-_]?oss + - title~=(?i)harmony actions: label: add: @@ -273,6 +280,20 @@ pull_request_rules: users: - "sangstar" +- name: assign reviewer for modelopt changes + conditions: + - or: + - files~=^vllm/model_executor/layers/quantization/modelopt\.py$ + - files~=^vllm/model_executor/layers/quantization/__init__\.py$ + - files~=^tests/models/quantization/test_modelopt\.py$ + - files~=^tests/quantization/test_modelopt\.py$ + - files~=^tests/models/quantization/test_nvfp4\.py$ + - files~=^docs/features/quantization/modelopt\.md$ + actions: + assign: + users: + - "Edwardf0t1" + - name: remove 'needs-rebase' label when conflict is resolved conditions: - -conflict diff --git a/.github/scale-config.yml b/.github/scale-config.yml new file mode 100644 index 000000000..c41a3ee3e --- /dev/null +++ b/.github/scale-config.yml @@ -0,0 +1,21 @@ +# scale-config.yml: +# Powers what instance types are available for GHA auto-scaled +# runners. Runners listed here will be available as self hosted +# runners, configuration is directly pulled from the main branch. +# runner_types: +# runner_label: +# instance_type: m4.large +# os: linux +# # min_available defaults to the global cfg in the ALI Terraform +# min_available: undefined +# # when max_available value is not defined, no max runners is enforced +# max_available: undefined +# disk_size: 50 +# is_ephemeral: true + +runner_types: + linux.2xlarge: + disk_size: 150 + instance_type: c5.2xlarge + is_ephemeral: true + os: linux diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 315042fbf..d8bbedef3 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml new file mode 100644 index 000000000..3795b046d --- /dev/null +++ b/.github/workflows/bc-lint.yml @@ -0,0 +1,27 @@ +name: BC Lint + +on: + pull_request: + types: + - opened + - synchronize + - reopened + +jobs: + bc_lint: + if: github.repository_owner == 'vllm-project' + runs-on: ubuntu-latest + steps: + - name: Run BC Lint Action + uses: pytorch/test-infra/.github/actions/bc-lint@main + with: + repo: ${{ github.event.pull_request.head.repo.full_name }} + base_sha: ${{ github.event.pull_request.base.sha }} + head_sha: ${{ github.event.pull_request.head.sha }} + suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }} + docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter' + config_dir: .github + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index d5c6b8d43..c3e132a53 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: '3.12' diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index 6401d6586..c2b17abe8 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Label issues based on keywords - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | // Configuration: Add new labels and keywords here @@ -49,6 +49,10 @@ jobs: term: "VLLM_ROCM_", searchIn: "both" }, + { + term: "aiter", + searchIn: "title" + }, { term: "rocm", searchIn: "title" diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 195579f20..e21d13b81 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 1ee605dc7..8884359fa 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | try { diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 656f3d3fa..82844810a 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 + - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/.gitignore b/.gitignore index 465935d48..b1df673e8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* -# triton jit +# triton jit .triton # Byte-compiled / optimized / DLL files @@ -177,6 +177,14 @@ cython_debug/ # VSCode .vscode/ +# Claude +CLAUDE.md +.claude/ + +# Codex +AGENTS.md +.codex/ + # DS Store .DS_Store @@ -209,4 +217,4 @@ shellcheck*/ csrc/moe/marlin_moe_wna16/kernel_* # Ignore ep_kernels_workspace folder -ep_kernels_workspace/ \ No newline at end of file +ep_kernels_workspace/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 612b290e8..c16bdeeec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: ruff-format files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/crate-ci/typos - rev: v1.34.0 + rev: v1.35.5 hooks: - id: typos - repo: https://github.com/PyCQA/isort diff --git a/.yapfignore b/.yapfignore index 2d6dcf838..381582590 100644 --- a/.yapfignore +++ b/.yapfignore @@ -1 +1,2 @@ collect_env.py +vllm/model_executor/layers/fla/ops/*.py diff --git a/CMakeLists.txt b/CMakeLists.txt index fb645b183..1e58ebb55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") # # Try to find python package with an executable that exactly matches @@ -542,6 +542,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -560,6 +561,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") diff --git a/MANIFEST.in b/MANIFEST.in index 82fd22b84..fb3cccbb4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,6 @@ include LICENSE include requirements/common.txt include requirements/cuda.txt include requirements/rocm.txt -include requirements/neuron.txt include requirements/cpu.txt include CMakeLists.txt diff --git a/README.md b/README.md index 8812aac4e..b4a3583c2 100644 --- a/README.md +++ b/README.md @@ -14,19 +14,24 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |

+--- +Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year! + --- *Latest News* 🔥 +- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). +- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). -- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). -- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News +- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). +- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). diff --git a/benchmarks/README.md b/benchmarks/README.md index 38072152b..ee1726420 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -95,6 +95,24 @@ become available. ✅ lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered + + HuggingFace-MTBench + ✅ + ✅ + philschmid/mt-bench + + + HuggingFace-Blazedit + ✅ + ✅ + vdaita/edit_5k_char, vdaita/edit_10k_char + + + Spec Bench + ✅ + ✅ + wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl + Custom ✅ @@ -110,7 +128,12 @@ become available. 🚧: to be supported -**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` +**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`. +For local `dataset-path`, please set `hf-name` to its Hugging Face ID like + +```bash +--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat +``` ## 🚀 Example - Online Benchmark @@ -234,6 +257,43 @@ vllm bench serve \ --num-prompts 2048 ``` +### Spec Bench Benchmark with Speculative Decoding + +``` bash +VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' +``` + +[SpecBench dataset](https://github.com/hemingkx/Spec-Bench) + +Run all categories: + +``` bash +# Download the dataset using: +# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl + +vllm bench serve \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name spec_bench \ + --dataset-path "/data/spec_bench/question.jsonl" \ + --num-prompts -1 +``` + +Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`. + +Run only a specific category like "summarization": + +``` bash +vllm bench serve \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name spec_bench \ + --dataset-path "/data/spec_bench/question.jsonl" \ + --num-prompts -1 + --spec-bench-category "summarization" +``` + ### Other HuggingFaceDataset Examples ```bash @@ -290,6 +350,18 @@ vllm bench serve \ --num-prompts 80 ``` +`vdaita/edit_5k_char` or `vdaita/edit_10k_char`: + +``` bash +vllm bench serve \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path vdaita/edit_5k_char \ + --num-prompts 90 \ + --blazedit-min-distance 0.01 \ + --blazedit-max-distance 0.99 +``` + ### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling @@ -689,7 +761,7 @@ python -m vllm.entrypoints.openai.api_server \ Send requests with images: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --dataset-name sharegpt \ @@ -716,7 +788,7 @@ python -m vllm.entrypoints.openai.api_server \ Send requests with videos: ```bash -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend openai-chat \ --model Qwen/Qwen2.5-VL-7B-Instruct \ --dataset-name sharegpt \ diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index 9aad51df6..3aa988aac 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -31,6 +31,12 @@ cd vllm You must set the following variables at the top of the script before execution. + Note: You can also override the default values below via environment variables when running the script. + +```bash +MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh +``` + | Variable | Description | Example Value | | --- | --- | --- | | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 82c20ffa6..ed3679b66 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -5,25 +5,41 @@ TAG=$(date +"%Y_%m_%d_%H_%M") SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -BASE="$SCRIPT_DIR/../../.." -MODEL="meta-llama/Llama-3.1-8B-Instruct" -SYSTEM="TPU" -TP=1 -DOWNLOAD_DIR="" -INPUT_LEN=4000 -OUTPUT_LEN=16 -MAX_MODEL_LEN=4096 -MIN_CACHE_HIT_PCT=0 -MAX_LATENCY_ALLOWED_MS=100000000000 -NUM_SEQS_LIST="128 256" -NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" +VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO} +BASE=${BASE:-"$SCRIPT_DIR/../../.."} +MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"} +SYSTEM=${SYSTEM:-"TPU"} +TP=${TP:-1} +DOWNLOAD_DIR=${DOWNLOAD_DIR:-""} +INPUT_LEN=${INPUT_LEN:-4000} +OUTPUT_LEN=${OUTPUT_LEN:-16} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096} +MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0} +MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} +NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} +NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" PROFILE_PATH="$LOG_FOLDER/profile" -echo "result file: $RESULT" -echo "model: $MODEL" +echo "====================== AUTO TUNE PARAMETERS ====================" +echo "SCRIPT_DIR=$SCRIPT_DIR" +echo "BASE=$BASE" +echo "MODEL=$MODEL" +echo "SYSTEM=$SYSTEM" +echo "TP=$TP" +echo "DOWNLOAD_DIR=$DOWNLOAD_DIR" +echo "INPUT_LEN=$INPUT_LEN" +echo "OUTPUT_LEN=$OUTPUT_LEN" +echo "MAX_MODEL_LEN=$MAX_MODEL_LEN" +echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT" +echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS" +echo "NUM_SEQS_LIST=$NUM_SEQS_LIST" +echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST" +echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" +echo "RESULT_FILE=$RESULT" +echo "====================== AUTO TUNEPARAMETERS ====================" rm -rf $LOG_FOLDER rm -rf $PROFILE_PATH @@ -213,7 +229,7 @@ run_benchmark() { pkill -if vllm sleep 10 - printf '=%.0s' $(seq 1 20) + echo "====================" return 0 } diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py index fd363c2ad..eae8d9927 100644 --- a/benchmarks/benchmark_block_pool.py +++ b/benchmarks/benchmark_block_pool.py @@ -57,7 +57,7 @@ def invoke_main() -> None: "--num-iteration", type=int, default=1000, - help="Number of iterations to run to stablize final data readings", + help="Number of iterations to run to stabilize final data readings", ) parser.add_argument( "--allocate-blocks", diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 2ea4f9cca..64ffa62c0 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset): # [6880, 6881] -> ['Ġcalls', 'here'] -> # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] # To avoid uncontrolled change of the prompt length, - # the encoded sequence is truncated before being decode again. + # the encoded sequence is truncated before being decoded again. total_input_len = prefix_len + int(input_lens[i]) re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ :total_input_len diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d8b960eda..a7892f3f7 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,191 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark the latency of processing a single batch of requests.""" - -import argparse -import dataclasses -import json -import os -import time -from typing import Any, Optional - -import numpy as np -from tqdm import tqdm -from typing_extensions import deprecated - -import vllm.envs as envs -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptType -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={"latency": results["latencies"]}, - extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, - ) - if pt_records: - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -@deprecated( - "benchmark_latency.py is deprecated and will be removed in a " - "future version. Please use 'vllm bench latency' instead.", -) -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + args.output_len - ), ( - "Please ensure that max_model_len is greater than" - " the sum of input_len and output_len." - ) - - sampling_params = SamplingParams( - n=args.n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def llm_generate(): - if not args.use_beam_search: - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - else: - llm.beam_search( - dummy_prompts, - BeamSearchParams( - beam_width=args.n, - max_tokens=args.output_len, - ignore_eos=True, - ), - ) - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - llm.start_profile() - llm_generate() - llm.stop_profile() - else: - start_time = time.perf_counter() - llm_generate() - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f"Avg latency: {np.mean(latencies)} seconds") - for percentage, percentile in zip(percentages, percentiles): - print(f"{percentage}% percentile latency: {percentile} seconds") - - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--n", - type=int, - default=1, - help="Number of generated sequences per prompt.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-iters-warmup", - type=int, - default=10, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", type=int, default=30, help="Number of iterations to run." - ) - parser.add_argument( - "--profile", - action="store_true", - help="profile the generation process of a single batch", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the latency results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)" - ), - ) - - parser = EngineArgs.add_cli_args(parser) - # V1 enables prefix caching by default which skews the latency - # numbers. We need to disable prefix caching by default. - parser.set_defaults(enable_prefix_caching=False) - - return parser - +import sys if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) - main(args) + print("""DEPRECATED: This script has been moved to the vLLM CLI. + +Please use the following command instead: + vllm bench latency + +For help with the new command, run: + vllm bench latency --help + +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench latency --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index c60040d05..11833fa1b 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -77,7 +77,7 @@ def invoke_main() -> None: "--num-iteration", type=int, default=100, - help="Number of iterations to run to stablize final data readings", + help="Number of iterations to run to stabilize final data readings", ) parser.add_argument( "--num-req", type=int, default=128, help="Number of requests in the batch" diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 02f5f585c..76cf51498 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,1324 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -r"""Benchmark online serving throughput. - -On the server side, run one of the following commands: - vLLM OpenAI API server - vllm serve \ - --swap-space 16 - -On the client side, run: - python benchmarks/benchmark_serving.py \ - --backend \ - --model \ - --dataset-name sharegpt \ - --dataset-path \ - --request-rate \ # By default is inf - --num-prompts # By default is 1000 - - when using tgi backend, add - --endpoint /generate_stream - to the end of the command above. -""" - -import argparse -import asyncio -import gc -import json -import os -import random -import time -import warnings -from collections.abc import Iterable -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Literal, Optional - -import numpy as np -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase -from typing_extensions import deprecated - -from backend_request_func import ( - ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, - RequestFuncInput, - RequestFuncOutput, -) - -try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: - from backend_request_func import get_tokenizer - -try: - from vllm.utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - -from benchmark_dataset import ( - AIMODataset, - ASRDataset, - BurstGPTDataset, - ConversationDataset, - CustomDataset, - HuggingFaceDataset, - InstructCoderDataset, - MTBenchDataset, - NextEditPredictionDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.benchmarks.serve import get_request - -MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - - -@dataclass -class BenchmarkMetrics: - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. - mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - - -def calculate_metrics( - input_requests: list[SampleRequest], - outputs: list[RequestFuncOutput], - dur_s: float, - tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - goodput_config_dict: dict[str, float], -) -> tuple[BenchmarkMetrics, list[int]]: - actual_output_lens: list[int] = [] - total_input = 0 - completed = 0 - good_completed = 0 - itls: list[float] = [] - tpots: list[float] = [] - all_tpots: list[float] = [] - ttfts: list[float] = [] - e2els: list[float] = [] - for i in range(len(outputs)): - if outputs[i].success: - output_len = outputs[i].output_tokens - - if not output_len: - # We use the tokenizer to count the number of output tokens - # for some serving backends instead of looking at - # len(outputs[i].itl) since multiple output tokens may be - # bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer( - outputs[i].generated_text, add_special_tokens=False - ).input_ids - ) - actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len - tpot = 0 - if output_len > 1: - latency_minus_ttft = outputs[i].latency - outputs[i].ttft - tpot = latency_minus_ttft / (output_len - 1) - tpots.append(tpot) - # Note: if output_len <= 1, we regard tpot as 0 for goodput - all_tpots.append(tpot) - itls += outputs[i].itl - ttfts.append(outputs[i].ttft) - e2els.append(outputs[i].latency) - completed += 1 - else: - actual_output_lens.append(0) - - if goodput_config_dict: - valid_metrics = [] - slo_values = [] - - if "ttft" in goodput_config_dict: - valid_metrics.append(ttfts) - slo_values.append( - goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "tpot" in goodput_config_dict: - valid_metrics.append(all_tpots) - slo_values.append( - goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "e2el" in goodput_config_dict: - valid_metrics.append(e2els) - slo_values.append( - goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - - for req_metric in zip(*valid_metrics): - is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) - if is_good_req: - good_completed += 1 - - if completed == 0: - warnings.warn( - "All requests failed. This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2, - ) - metrics = BenchmarkMetrics( - completed=completed, - total_input=total_input, - total_output=sum(actual_output_lens), - request_throughput=completed / dur_s, - request_goodput=good_completed / dur_s, - output_throughput=sum(actual_output_lens) / dur_s, - total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) - * 1000, # ttfts is empty if streaming is not supported by backend - std_ttft_ms=np.std(ttfts or 0) * 1000, - median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[ - (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles - ], - mean_tpot_ms=np.mean(tpots or 0) * 1000, - std_tpot_ms=np.std(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[ - (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles - ], - mean_itl_ms=np.mean(itls or 0) * 1000, - std_itl_ms=np.std(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[ - (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles - ], - mean_e2el_ms=np.mean(e2els or 0) * 1000, - std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles - ], - ) - - return metrics, actual_output_lens - - -async def benchmark( - backend: str, - api_url: str, - base_url: str, - model_id: str, - model_name: str, - tokenizer: PreTrainedTokenizerBase, - input_requests: list[SampleRequest], - logprobs: Optional[int], - request_rate: float, - burstiness: float, - disable_tqdm: bool, - profile: bool, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - ignore_eos: bool, - goodput_config_dict: dict[str, float], - max_concurrency: Optional[int], - lora_modules: Optional[Iterable[str]], - extra_body: Optional[dict], - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -): - if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[backend] - else: - raise ValueError(f"Unknown backend: {backend}") - - print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0].prompt, - input_requests[0].prompt_len, - input_requests[0].expected_output_len, - input_requests[0].multi_modal_data, - ) - - assert ( - test_mm_content is None - or isinstance(test_mm_content, dict) - or ( - isinstance(test_mm_content, list) - and all(isinstance(item, dict) for item in test_mm_content) - ) - ), "multi_modal_data must be a dict or list[dict]" - test_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=api_url, - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - - test_output = await request_func(request_func_input=test_input) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}" - ) - else: - print("Initial test run completed. Starting main benchmark run...") - - if lora_modules: - # For each input request, choose a LoRA module at random. - lora_modules = iter( - [random.choice(lora_modules) for _ in range(len(input_requests))] - ) - - if profile: - print("Starting profiler...") - profile_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=base_url + "/start_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler started") - - distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" - - if ramp_up_strategy is not None: - print( - f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase " - f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over " - "the duration of the benchmark." - ) - else: - print(f"Traffic request rate: {request_rate} RPS.") - - print(f"Burstiness factor: {burstiness} ({distribution})") - print(f"Maximum request concurrency: {max_concurrency}") - - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None - - async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) - async with semaphore: - return await request_func(request_func_input=request_func_input, pbar=pbar) - - benchmark_start_time = time.perf_counter() - tasks: list[asyncio.Task] = [] - - rps_change_events = [] - last_int_rps = -1 - if ramp_up_strategy is not None and ramp_up_start_rps is not None: - last_int_rps = ramp_up_start_rps - rps_change_events.append( - { - "rps": last_int_rps, - "timestamp": datetime.now().isoformat(), - } - ) - - async for request, current_request_rate in get_request( - input_requests, - request_rate, - burstiness, - ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - ): - if ramp_up_strategy is not None: - current_int_rps = int(current_request_rate) - if current_int_rps > last_int_rps: - timestamp = datetime.now().isoformat() - for rps_val in range(last_int_rps + 1, current_int_rps + 1): - rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) - last_int_rps = current_int_rps - - prompt, prompt_len, output_len, mm_content, request_id = ( - request.prompt, - request.prompt_len, - request.expected_output_len, - request.multi_modal_data, - request.request_id, - ) - req_model_id, req_model_name = model_id, model_name - if lora_modules: - req_lora_module = next(lora_modules) - req_model_id, req_model_name = req_lora_module, req_lora_module - - request_func_input = RequestFuncInput( - model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - request_id=request_id, - ) - task = limited_request_func(request_func_input=request_func_input, pbar=pbar) - tasks.append(asyncio.create_task(task)) - outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - - if pbar is not None: - pbar.close() - - benchmark_duration = time.perf_counter() - benchmark_start_time - - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentile_metrics=selected_percentile_metrics, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) - - print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - if max_concurrency is not None: - print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) - if request_rate != float("inf"): - print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", metrics.request_throughput - ) - ) - if goodput_config_dict: - print( - "{:<40} {:<10.2f}".format( - "Request goodput (req/s):", metrics.request_goodput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput - ) - ) - - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - "total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput": metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - } - - if rps_change_events: - result["rps_change_events"] = rps_change_events - - def process_one_metric( - # E.g., "ttft" - metric_attribute_name: str, - # E.g., "TTFT" - metric_name: str, - # E.g., "Time to First Token" - metric_header: str, - ): - # This function prints and adds statistics of the specified - # metric. - if metric_attribute_name not in selected_percentile_metrics: - return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) - print( - "{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"), - ) - ) - print( - "{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"), - ) - ) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms" - ) - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms" - ) - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms" - ) - for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) - result[f"p{p_word}_{metric_attribute_name}_ms"] = value - - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") - process_one_metric("e2el", "E2EL", "End-to-end Latency") - - print("=" * 50) - - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - - return result - - -def check_goodput_args(args): - # Check and parse goodput arguments - goodput_config_dict = {} - VALID_NAMES = ["ttft", "tpot", "e2el"] - if args.goodput: - goodput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in goodput_config_dict.items(): - if slo_name not in VALID_NAMES: - raise ValueError( - f"Invalid metric name found, {slo_name}: {slo_val}. " - "The service level objective name should be one of " - f"{str(VALID_NAMES)}. " - ) - if slo_val < 0: - raise ValueError( - f"Invalid value found, {slo_name}: {slo_val}. " - "The service level objective value should be " - "non-negative." - ) - return goodput_config_dict - - -def parse_goodput(slo_pairs): - goodput_config_dict = {} - try: - for slo_pair in slo_pairs: - slo_name, slo_val = slo_pair.split(":") - goodput_config_dict[slo_name] = float(slo_val) - except ValueError as err: - raise argparse.ArgumentTypeError( - "Invalid format found for service level objectives. " - 'Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is a " - "number in milliseconds." - ) from err - return goodput_config_dict - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any], file_name: str -) -> None: - metrics = [ - "median_ttft_ms", - "mean_ttft_ms", - "std_ttft_ms", - "p99_ttft_ms", - "mean_tpot_ms", - "median_tpot_ms", - "std_tpot_ms", - "p99_tpot_ms", - "median_itl_ms", - "mean_itl_ms", - "std_itl_ms", - "p99_itl_ms", - ] - # These raw data might be useful, but they are rather big. They can be added - # later if needed - ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={k: [results[k]] for k in metrics}, - extra_info={ - k: results[k] - for k in results - if k not in metrics and k not in ignored_metrics - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -@deprecated( - "benchmark_serving.py is deprecated and will be removed in a future " - "version. Please use 'vllm bench serve' instead.", -) -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - backend = args.backend - model_id = args.model - model_name = args.served_model_name - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode - - # Validate ramp-up arguments - if args.ramp_up_strategy is not None: - if args.request_rate != float("inf"): - raise ValueError( - "When using ramp-up, do not specify --request-rate. " - "The request rate will be controlled by ramp-up parameters. " - "Please remove the --request-rate argument." - ) - if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: - raise ValueError( - "When using --ramp-up-strategy, both --ramp-up-start-rps and " - "--ramp-up-end-rps must be specified" - ) - if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: - raise ValueError("Ramp-up start and end RPS must be non-negative") - if args.ramp_up_start_rps > args.ramp_up_end_rps: - raise ValueError("Ramp-up start RPS must be less than end RPS") - if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0: - raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") - - if args.base_url is not None: - api_url = f"{args.base_url}{args.endpoint}" - base_url = f"{args.base_url}" - else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" - - tokenizer = get_tokenizer( - tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=args.trust_remote_code, - ) - - if args.dataset_name is None: - raise ValueError( - "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required." - ) - - if args.dataset_name == "custom": - dataset = CustomDataset(dataset_path=args.dataset_path) - input_requests = dataset.sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.custom_output_len, - skip_chat_template=args.custom_skip_chat_template, - request_id_prefix=args.request_id_prefix, - ) - - elif args.dataset_name == "sonnet": - dataset = SonnetDataset(dataset_path=args.dataset_path) - # For the "sonnet" dataset, formatting depends on the backend. - if args.backend == "openai-chat": - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=False, - request_id_prefix=args.request_id_prefix, - ) - else: - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." - ) - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=True, - request_id_prefix=args.request_id_prefix, - ) - - elif args.dataset_name == "hf": - # all following datasets are implemented from the - # HuggingFaceDataset base class - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_class = VisionArenaDataset - args.hf_split = "train" - args.hf_subset = None - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_class = InstructCoderDataset - args.hf_split = "train" - elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: - dataset_class = MTBenchDataset - args.hf_split = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ConversationDataset - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_class = AIMODataset - args.hf_split = "train" - elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 - dataset_class = NextEditPredictionDataset - args.hf_split = "train" - elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ASRDataset - args.hf_split = "train" - else: - supported_datasets = set( - [ - dataset_name - for cls in HuggingFaceDataset.__subclasses__() - for dataset_name in cls.SUPPORTED_DATASET_PATHS - ] - ) - raise ValueError( - f"Unsupported dataset path: {args.dataset_path}. " - "Huggingface dataset only supports dataset_path" - f" from one of following: {supported_datasets}. " - "Please consider contributing if you would " - "like to add support for additional dataset formats." - ) - - if dataset_class.IS_MULTIMODAL and backend not in [ - "openai-chat", - "openai-audio", - ]: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' backend." - ) - input_requests = dataset_class( - dataset_path=args.dataset_path, - dataset_subset=args.hf_subset, - dataset_split=args.hf_split, - random_seed=args.seed, - no_stream=args.no_stream, - ).sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.hf_output_len, - request_id_prefix=args.request_id_prefix, - ) - - else: - # For datasets that follow a similar structure, use a mapping. - dataset_mapping = { - "sharegpt": lambda: ShareGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - request_id_prefix=args.request_id_prefix, - ), - "burstgpt": lambda: BurstGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - request_id_prefix=args.request_id_prefix, - ), - "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - range_ratio=args.random_range_ratio, - request_id_prefix=args.request_id_prefix, - ), - } - - try: - input_requests = dataset_mapping[args.dataset_name]() - except KeyError as err: - raise ValueError(f"Unknown dataset: {args.dataset_name}") from err - goodput_config_dict = check_goodput_args(args) - - # Collect the sampling parameters. - sampling_params = { - k: v - for k, v in { - "top_p": args.top_p, - "top_k": args.top_k, - "min_p": args.min_p, - "temperature": args.temperature, - }.items() - if v is not None - } - - # Sampling parameters are only supported by openai-compatible backend. - if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: - raise ValueError( - "Sampling parameters are only supported by openai-compatible backends." - ) - - if "temperature" not in sampling_params: - sampling_params["temperature"] = 0.0 # Default to greedy decoding. - - if args.backend == "llama.cpp": - # Disable prompt caching in llama.cpp backend - sampling_params["cache_prompt"] = False - - # Avoid GC processing "static" data - reduce pause times. - gc.collect() - gc.freeze() - - benchmark_result = asyncio.run( - benchmark( - backend=backend, - api_url=api_url, - base_url=base_url, - model_id=model_id, - model_name=model_name, - tokenizer=tokenizer, - input_requests=input_requests, - logprobs=args.logprobs, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, - selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], - ignore_eos=args.ignore_eos, - goodput_config_dict=goodput_config_dict, - max_concurrency=args.max_concurrency, - lora_modules=args.lora_modules, - extra_body=sampling_params, - ramp_up_strategy=args.ramp_up_strategy, - ramp_up_start_rps=args.ramp_up_start_rps, - ramp_up_end_rps=args.ramp_up_end_rps, - ) - ) - - # Save config and results to json - if args.save_result or args.append_result: - result_json: dict[str, Any] = {} - - # Setup - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_json["date"] = current_dt - result_json["backend"] = backend - result_json["model_id"] = model_id - result_json["tokenizer_id"] = tokenizer_id - result_json["num_prompts"] = args.num_prompts - - # Metadata - if args.metadata: - for item in args.metadata: - if "=" in item: - kvstring = item.split("=") - result_json[kvstring[0].strip()] = kvstring[1].strip() - else: - raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." - ) - # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf" - ) - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency - - if args.ramp_up_strategy is not None: - result_json["ramp_up_strategy"] = args.ramp_up_strategy - result_json["ramp_up_start_rps"] = args.ramp_up_start_rps - result_json["ramp_up_end_rps"] = args.ramp_up_end_rps - - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} - - if not args.save_detailed: - # Remove fields with too many data points - for field in [ - "input_lens", - "output_lens", - "ttfts", - "itls", - "generated_texts", - "errors", - ]: - if field in result_json: - del result_json[field] - if field in benchmark_result: - del benchmark_result[field] - - # Save to file - base_model_id = model_id.split("/")[-1] - max_concurrency_str = ( - f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None - else "" - ) - if args.ramp_up_strategy is not None: - file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - else: - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - if args.result_filename: - file_name = args.result_filename - if args.result_dir: - os.makedirs(args.result_dir, exist_ok=True) - file_name = os.path.join(args.result_dir, file_name) - with open( - file_name, mode="a+" if args.append_result else "w", encoding="utf-8" - ) as outfile: - # Append a newline. - if args.append_result and outfile.tell() != 0: - outfile.write("\n") - json.dump(result_json, outfile) - save_to_pytorch_benchmark_format(args, result_json, file_name) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput." - ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) - parser.add_argument( - "--base-url", - type=str, - default=None, - help="Server or API base url if not using http host and port.", - ) - # Use 127.0.0.1 here instead of localhost to force the use of ipv4 - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument( - "--endpoint", - type=str, - default="/v1/completions", - help="API endpoint.", - ) - parser.add_argument( - "--dataset-name", - type=str, - default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], - help="Name of the dataset to benchmark on.", - ) - parser.add_argument( - "--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.", - ) - parser.add_argument( - "--no-stream", - action="store_true", - help="Do not load the dataset in streaming mode.", - ) - parser.add_argument( - "--max-concurrency", - type=int, - default=None, - help="Maximum number of concurrent requests. This can be used " - "to help simulate an environment where a higher level component " - "is enforcing a maximum number of concurrent requests. While the " - "--request-rate argument controls the rate at which requests are " - "initiated, this argument will control how many are actually allowed " - "to execute at a time. This means that when used in combination, the " - "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.", - ) - - parser.add_argument( - "--model", - type=str, - required=True, - help="Name of the model.", - ) - parser.add_argument( - "--tokenizer", - type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) - parser.add_argument( - "--logprobs", - type=int, - default=None, - help=( - "Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed" - ), - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process or gamma distribution " - "to synthesize the request arrival times.", - ) - parser.add_argument( - "--burstiness", - type=float, - default=1.0, - help="Burstiness factor of the request generation. " - "Only take effect when request_rate is not inf. " - "Default value is 1, which follows Poisson process. " - "Otherwise, the request intervals follow a gamma distribution. " - "A lower burstiness value (0 < burstiness < 1) results in more " - "bursty requests. A higher burstiness value (burstiness > 1) " - "results in a more uniform arrival of requests.", - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) - parser.add_argument( - "--profile", - action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", - ) - parser.add_argument( - "--save-result", - action="store_true", - help="Specify to save benchmark results to a json file", - ) - parser.add_argument( - "--save-detailed", - action="store_true", - help="When saving the results, whether to include per request " - "information such as response, error, ttfs, tpots, etc.", - ) - parser.add_argument( - "--append-result", - action="store_true", - help="Append the benchmark result to the existing json file.", - ) - parser.add_argument( - "--metadata", - metavar="KEY=VALUE", - nargs="*", - help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " - "for metadata of this run to be saved in the result JSON file " - "for record keeping purposes.", - ) - parser.add_argument( - "--result-dir", - type=str, - default=None, - help="Specify directory to save benchmark json results." - "If not specified, results are saved in the current directory.", - ) - parser.add_argument( - "--result-filename", - type=str, - default=None, - help="Specify the filename to save benchmark json results." - "If not specified, results will be saved in " - "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" - " format.", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", - ) - parser.add_argument( - "--percentile-metrics", - type=str, - default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " - "This argument specifies the metrics to report percentiles. " - 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' - 'Default value is "ttft,tpot,itl".', - ) - parser.add_argument( - "--metric-percentiles", - type=str, - default="99", - help="Comma-separated list of percentiles for selected metrics. " - 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' - 'Default value is "99". ' - 'Use "--percentile-metrics" to select metrics.', - ) - parser.add_argument( - "--goodput", - nargs="+", - required=False, - help='Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is in " - 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' - "separated by spaces. Allowed request level metric names are " - '"ttft", "tpot", "e2el". For more context on the definition of ' - "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve", - ) - parser.add_argument( - "--request-id-prefix", - type=str, - required=False, - default="benchmark-serving", - help="Specify the prefix of request id.", - ) - - # group for dataset specific arguments - custom_group = parser.add_argument_group("custom dataset options") - custom_group.add_argument( - "--custom-output-len", - type=int, - default=256, - help="Number of output tokens per request, used only for custom dataset.", - ) - custom_group.add_argument( - "--custom-skip-chat-template", - action="store_true", - help="Skip applying chat template to prompt, used only for custom dataset.", - ) - - sonnet_group = parser.add_argument_group("sonnet dataset options") - sonnet_group.add_argument( - "--sonnet-input-len", - type=int, - default=550, - help="Number of input tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-output-len", - type=int, - default=150, - help="Number of output tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-prefix-len", - type=int, - default=200, - help="Number of prefix tokens per request, used only for sonnet dataset.", - ) - - sharegpt_group = parser.add_argument_group("sharegpt dataset options") - sharegpt_group.add_argument( - "--sharegpt-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.", - ) - - random_group = parser.add_argument_group("random dataset options") - random_group.add_argument( - "--random-input-len", - type=int, - default=1024, - help="Number of input tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-output-len", - type=int, - default=128, - help="Number of output tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for random sampling. Must be in the range [0, 1) to define " - "a symmetric sampling range" - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - random_group.add_argument( - "--random-prefix-len", - type=int, - default=0, - help=( - "Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]." - ), - ) - - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - hf_group.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", - ) - - sampling_group = parser.add_argument_group("sampling parameters") - sampling_group.add_argument( - "--top-p", - type=float, - default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--top-k", - type=int, - default=None, - help="Top-k sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--min-p", - type=float, - default=None, - help="Min-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--temperature", - type=float, - default=None, - help="Temperature sampling parameter. Only has effect on " - "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).", - ) - - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - choices=["auto", "slow", "mistral", "custom"], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.', - ) - - parser.add_argument( - "--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ", - ) - - parser.add_argument( - "--lora-modules", - nargs="+", - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.", - ) - - parser.add_argument( - "--ramp-up-strategy", - type=str, - default=None, - choices=["linear", "exponential"], - help="The ramp-up strategy. This would be used to " - "ramp up the request rate from initial RPS to final " - "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). " - "over the duration of the benchmark.", - ) - parser.add_argument( - "--ramp-up-start-rps", - type=int, - default=None, - help="The starting request rate for ramp-up (RPS). " - "Needs to be specified when --ramp-up-strategy is used.", - ) - parser.add_argument( - "--ramp-up-end-rps", - type=int, - default=None, - help="The ending request rate for ramp-up (RPS). " - "Needs to be specified when --ramp-up-strategy is used.", - ) - - return parser - +import sys if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - main(args) + print("""DEPRECATED: This script has been moved to the vLLM CLI. + +Please use the following command instead: + vllm bench serve + +For help with the new command, run: + vllm bench serve --help + +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench serve --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index ca6843a72..4aae755eb 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -998,7 +998,7 @@ def create_argument_parser(): "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'Default value is "ttft,tpot,itl".', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 6b24b8c8f..b6dc0918f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,741 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark offline inference throughput.""" - -import argparse -import dataclasses -import json -import os -import random -import time -import warnings -from typing import Any, Optional, Union - -import torch -import uvloop -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase -from typing_extensions import deprecated - -from benchmark_dataset import ( - AIMODataset, - BurstGPTDataset, - ConversationDataset, - InstructCoderDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args, -) -from vllm.inputs import TextPrompt, TokensPrompt -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -def run_vllm( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - # Add the requests to the engine. - prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests: Optional[list[LoRARequest]] = None - if engine_args.enable_lora: - lora_requests = [request.lora_request for request in requests] - - use_beam_search = False - - outputs = None - if not use_beam_search: - start = time.perf_counter() - outputs = llm.generate( - prompts, sampling_params, lora_request=lora_requests, use_tqdm=True - ) - end = time.perf_counter() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - # output_len should be the same for all requests. - output_len = requests[0].expected_output_len - for request in requests: - assert request.expected_output_len == output_len - start = time.perf_counter() - llm.beam_search( - prompts, - BeamSearchParams( - beam_width=n, - max_tokens=output_len, - ignore_eos=True, - ), - ) - end = time.perf_counter() - return end - start, outputs - - -def run_vllm_chat( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, list[RequestOutput]]: - """ - Run vLLM chat benchmark. This function is recommended ONLY for benchmarking - multimodal models as it properly handles multimodal inputs and chat - formatting. For non-multimodal models, use run_vllm() instead. - """ - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of " - "prompt_len and expected_output_len for all requests." - ) - - prompts = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - start = time.perf_counter() - outputs = llm.chat(prompts, sampling_params, use_tqdm=True) - end = time.perf_counter() - return end - start, outputs - - -async def run_vllm_async( - requests: list[SampleRequest], - n: int, - engine_args: AsyncEngineArgs, - disable_frontend_multiprocessing: bool = False, - disable_detokenize: bool = False, -) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, - disable_frontend_multiprocessing=disable_frontend_multiprocessing, - ) as llm: - model_config = await llm.get_model_config() - assert all( - model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - - # Add the requests to the engine. - prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - lora_requests: list[Optional[LoRARequest]] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests.append(request.lora_request) - - generators = [] - start = time.perf_counter() - for i, (prompt, sp, lr) in enumerate( - zip(prompts, sampling_params, lora_requests) - ): - generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - end = time.perf_counter() - return end - start - - -def run_hf( - requests: list[SampleRequest], - model: str, - tokenizer: PreTrainedTokenizerBase, - n: int, - max_batch_size: int, - trust_remote_code: bool, - disable_detokenize: bool = False, -) -> float: - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code - ) - if llm.config.model_type == "llama": - # To enable padding in the HF backend. - tokenizer.pad_token = tokenizer.eos_token - llm = llm.cuda() - - pbar = tqdm(total=len(requests)) - start = time.perf_counter() - batch: list[str] = [] - max_prompt_len = 0 - max_output_len = 0 - for i in range(len(requests)): - prompt = requests[i].prompt - prompt_len = requests[i].prompt_len - output_len = requests[i].expected_output_len - # Add the prompt to the batch. - batch.append(prompt) - max_prompt_len = max(max_prompt_len, prompt_len) - max_output_len = max(max_output_len, output_len) - if len(batch) < max_batch_size and i != len(requests) - 1: - # Check if we can add more requests to the batch. - next_prompt_len = requests[i + 1].prompt_len - next_output_len = requests[i + 1].expected_output_len - if ( - max(max_prompt_len, next_prompt_len) - + max(max_output_len, next_output_len) - ) <= 2048: - # We can add more requests to the batch. - continue - - # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids - llm_outputs = llm.generate( - input_ids=input_ids.cuda(), - do_sample=True, - num_return_sequences=n, - temperature=1.0, - top_p=1.0, - use_cache=True, - max_new_tokens=max_output_len, - ) - if not disable_detokenize: - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) - pbar.update(len(batch)) - - # Clear the batch. - batch = [] - max_prompt_len = 0 - max_output_len = 0 - end = time.perf_counter() - return end - start - - -def run_mii( - requests: list[SampleRequest], - model: str, - tensor_parallel_size: int, - output_len: int, -) -> float: - from mii import client, serve - - llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [request.prompt for request in requests] - - start = time.perf_counter() - llm.generate(prompts, max_new_tokens=output_len) - end = time.perf_counter() - client = client(model) - client.terminate_server() - return end - start - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={ - "requests_per_second": [results["requests_per_second"]], - "tokens_per_second": [results["tokens_per_second"]], - }, - extra_info={ - k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def get_requests(args, tokenizer): - # Common parameters for all dataset types. - common_kwargs = { - "dataset_path": args.dataset_path, - "random_seed": args.seed, - } - sample_kwargs = { - "tokenizer": tokenizer, - "lora_path": args.lora_path, - "max_loras": args.max_loras, - "num_requests": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, - } - - if args.dataset_path is None or args.dataset_name == "random": - sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len - dataset_cls = RandomDataset - elif args.dataset_name == "sharegpt": - dataset_cls = ShareGPTDataset - if args.backend == "vllm-chat": - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_name == "sonnet": - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." - ) - dataset_cls = SonnetDataset - sample_kwargs["prefix_len"] = args.prefix_len - sample_kwargs["return_prompt_formatted"] = True - elif args.dataset_name == "burstgpt": - dataset_cls = BurstGPTDataset - elif args.dataset_name == "hf": - common_kwargs["no_stream"] = args.no_stream - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = VisionArenaDataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = InstructCoderDataset - common_kwargs["dataset_split"] = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = ConversationDataset - common_kwargs["dataset_subset"] = args.hf_subset - common_kwargs["dataset_split"] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_cls = AIMODataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - else: - raise ValueError(f"Unknown dataset name: {args.dataset_name}") - # Remove None values - sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} - return dataset_cls(**common_kwargs).sample(**sample_kwargs) - - -@deprecated( - "benchmark_throughput.py is deprecated and will be removed in a " - "future version. Please use 'vllm bench throughput' instead.", -) -def main(args: argparse.Namespace): - if args.seed is None: - args.seed = 0 - print(args) - random.seed(args.seed) - # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - requests = get_requests(args, tokenizer) - is_multi_modal = any(request.multi_modal_data is not None for request in requests) - request_outputs: Optional[list[RequestOutput]] = None - if args.backend == "vllm": - if args.async_engine: - elapsed_time = uvloop.run( - run_vllm_async( - requests, - args.n, - AsyncEngineArgs.from_cli_args(args), - args.disable_frontend_multiprocessing, - args.disable_detokenize, - ) - ) - else: - elapsed_time, request_outputs = run_vllm( - requests, - args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize, - ) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 - elapsed_time = run_hf( - requests, - args.model, - tokenizer, - args.n, - args.hf_max_batch_size, - args.trust_remote_code, - args.disable_detokenize, - ) - elif args.backend == "mii": - elapsed_time = run_mii( - requests, args.model, args.tensor_parallel_size, args.output_len - ) - elif args.backend == "vllm-chat": - elapsed_time, request_outputs = run_vllm_chat( - requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize - ) - else: - raise ValueError(f"Unknown backend: {args.backend}") - - if request_outputs: - # Note: with the vllm and vllm-chat backends, - # we have request_outputs, which we use to count tokens. - total_prompt_tokens = 0 - total_output_tokens = 0 - for ro in request_outputs: - if not isinstance(ro, RequestOutput): - continue - total_prompt_tokens += ( - len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 - ) - total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) - total_num_tokens = total_prompt_tokens + total_output_tokens - else: - total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) - total_output_tokens = sum(r.expected_output_len for r in requests) - total_prompt_tokens = total_num_tokens - total_output_tokens - - if is_multi_modal and args.backend != "vllm-chat": - print( - "\033[91mWARNING\033[0m: Multi-modal request with " - f"{args.backend} backend detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details." - ) - # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. - # vllm-chat backend counts the image tokens now - - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s" - ) - print(f"Total num prompt tokens: {total_prompt_tokens}") - print(f"Total num output tokens: {total_output_tokens}") - - # Output JSON results if specified - if args.output_json: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": total_num_tokens / elapsed_time, - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def validate_args(args): - """ - Validate command-line arguments. - """ - - # === Deprecation and Defaulting === - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next release. " - "Please use '--dataset-name' and '--dataset-path' instead.", - stacklevel=2, - ) - args.dataset_path = args.dataset - - if not getattr(args, "tokenizer", None): - args.tokenizer = args.model - - # === Backend Validation === - valid_backends = {"vllm", "hf", "mii", "vllm-chat"} - if args.backend not in valid_backends: - raise ValueError(f"Unsupported backend: {args.backend}") - - # === Dataset Configuration === - if not args.dataset and not args.dataset_path: - print("When dataset path is not set, it will default to random dataset") - args.dataset_name = "random" - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") - - # === Dataset Name Specific Checks === - # --hf-subset and --hf-split: only used - # when dataset_name is 'hf' - if args.dataset_name != "hf" and ( - getattr(args, "hf_subset", None) is not None - or getattr(args, "hf_split", None) is not None - ): - warnings.warn( - "--hf-subset and --hf-split will be ignored \ - since --dataset-name is not 'hf'.", - stacklevel=2, - ) - elif args.dataset_name == "hf": - if args.dataset_path in ( - VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() - | ConversationDataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm-chat", ( - f"{args.dataset_path} needs to use vllm-chat as the backend." - ) # noqa: E501 - elif args.dataset_path in ( - InstructCoderDataset.SUPPORTED_DATASET_PATHS - | AIMODataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm", ( - f"{args.dataset_path} needs to use vllm as the backend." - ) # noqa: E501 - else: - raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") - - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != "random" and args.random_range_ratio is not None: - warnings.warn( - "--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", - stacklevel=2, - ) - - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not - # set. - if ( - args.dataset_name not in {"random", "sonnet", None} - and args.prefix_len is not None - ): - warnings.warn( - "--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", - stacklevel=2, - ) - - # === LoRA Settings === - if getattr(args, "enable_lora", False) and args.backend != "vllm": - raise ValueError("LoRA benchmarking is only supported for vLLM backend") - if getattr(args, "enable_lora", False) and args.lora_path is None: - raise ValueError("LoRA path must be provided when enable_lora is True") - - # === Backend-specific Validations === - if args.backend == "hf" and args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend") - if args.backend != "hf" and args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - - if ( - args.backend in {"hf", "mii"} - and getattr(args, "quantization", None) is not None - ): - raise ValueError("Quantization is only for vLLM backend.") - - if args.backend == "mii" and args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.backend == "mii" and args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.backend == "mii" and args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII backend.") - - # --data-parallel is not supported currently. - # https://github.com/vllm-project/vllm/issues/16222 - if args.data_parallel_size > 1: - raise ValueError( - "Data parallel is not supported in offline benchmark, " - "please use benchmark serving instead" - ) - - -def create_argument_parser(): - parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument( - "--backend", - type=str, - choices=["vllm", "hf", "mii", "vllm-chat"], - default="vllm", - ) - parser.add_argument( - "--dataset-name", - type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], - help="Name of the dataset to benchmark on.", - default="sharegpt", - ) - parser.add_argument( - "--no-stream", - action="store_true", - help="Do not load the dataset in streaming mode.", - ) - parser.add_argument( - "--dataset", - type=str, - default=None, - help="Path to the ShareGPT dataset, will be deprecated in\ - the next release. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]", - ) - parser.add_argument( - "--dataset-path", type=str, default=None, help="Path to the dataset" - ) - parser.add_argument( - "--input-len", - type=int, - default=None, - help="Input prompt length for each request", - ) - parser.add_argument( - "--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.", - ) - parser.add_argument( - "--n", type=int, default=1, help="Number of generated sequences per prompt." - ) - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." - ) - parser.add_argument( - "--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the throughput results in JSON format.", - ) - parser.add_argument( - "--async-engine", - action="store_true", - default=False, - help="Use vLLM async engine rather than LLM class.", - ) - parser.add_argument( - "--disable-frontend-multiprocessing", - action="store_true", - default=False, - help="Disable decoupled async engine frontend.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize the response (i.e. do not include " - "detokenization time in the measurement)" - ), - ) - # LoRA - parser.add_argument( - "--lora-path", - type=str, - default=None, - help="Path to the LoRA adapters to use. This can be an absolute path, " - "a relative path, or a Hugging Face model identifier.", - ) - parser.add_argument( - "--prefix-len", - type=int, - default=None, - help=f"Number of prefix tokens to be used in RandomDataset " - "and SonnetDataset. For RandomDataset, the total input " - "length is the sum of prefix-len (default: " - f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " - "sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]. For SonnetDataset, " - f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " - "controls how much of the input is fixed lines versus " - "random lines, but the total input length remains approximately " - "input_len tokens.", - ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=None, - help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " - "for sampling input/output length, " - "used only for RandomDataset. Must be in the range [0, 1) to " - "define a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - - # hf dtaset - parser.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - parser.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - - parser = AsyncEngineArgs.add_cli_args(parser) - - return parser - +import sys if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - validate_args(args) - main(args) + print("""DEPRECATED: This script has been moved to the vLLM CLI. + +Please use the following command instead: + vllm bench throughput + +For help with the new command, run: + vllm bench throughput --help + +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench throughput --help +""") + sys.exit(1) diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 92f97ffab..2c72941cf 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -62,7 +62,7 @@ benchmark() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & CUDA_VISIBLE_DEVICES=1 python3 \ @@ -72,7 +72,7 @@ benchmark() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index af2bcba3e..0bbf7cd2b 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -69,7 +69,7 @@ launch_disagg_prefill() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -78,7 +78,7 @@ launch_disagg_prefill() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py new file mode 100644 index 000000000..f1e504499 --- /dev/null +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + CUTLASS_BLOCK_FP8_SUPPORTED, +) +from vllm.platforms import current_platform +from vllm.triton_utils import triton as vllm_triton + +assert current_platform.is_cuda(), ( + "Only support benchmarking w8a8 block fp8 kernel on CUDA device." +) + +# DeepSeek-V3 weight shapes +DEEPSEEK_V3_SHAPES = [ + (512 + 64, 7168), + (2112, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + (18432 * 2, 7168), + (24576, 1536), + (12288, 7168), + (4096, 7168), + (7168, 2048), +] + + +def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): + """Build runner function for w8a8 block fp8 matmul.""" + factor_for_scale = 1e-2 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + # Create random FP8 tensors + A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + + B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + # Create scales + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) + * factor_for_scale + ) + + # SM90 CUTLASS requires row-major format for scales + if use_cutlass and current_platform.is_device_capability(90): + Bs = Bs.T.contiguous() + + def run(): + if use_cutlass: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True + ) + else: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False + ) + + return run + + +# Determine available providers +available_providers = ["torch-bf16", "w8a8-block-fp8-triton"] +plot_title = "BF16 vs W8A8 Block FP8 GEMMs" + +if CUTLASS_BLOCK_FP8_SUPPORTED: + available_providers.append("w8a8-block-fp8-cutlass") + + +@vllm_triton.testing.perf_report( + vllm_triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=available_providers, + line_names=available_providers, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs W8A8 Block FP8 GEMMs", + args={}, + ) +) +def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): + M = batch_size + device = "cuda" + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + a = torch.randn((M, K), device=device, dtype=torch.bfloat16) + b = torch.randn((N, K), device=device, dtype=torch.bfloat16) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + elif provider == "w8a8-block-fp8-triton": + run_w8a8_triton = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=False + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_triton(), quantiles=quantiles + ) + elif provider == "w8a8-block-fp8-cutlass": + run_w8a8_cutlass = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=True + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_cutlass(), quantiles=quantiles + ) + else: + raise ValueError(f"Unknown provider: {provider}") + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +if __name__ == "__main__": + block_size = (128, 128) + + for N, K in DEEPSEEK_V3_SHAPES: + print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}") + + print(f"TFLOP/s comparison (block_size={block_size}):") + benchmark_tflops.run( + print_data=True, + # show_plots=False, + # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}", + N=N, + K=K, + block_size=block_size, + ) + + print("\nBenchmark finished!") diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py new file mode 100644 index 000000000..93edbcc93 --- /dev/null +++ b/benchmarks/kernels/benchmark_activation.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# benchmark custom activation op performance +import itertools + +import torch + +import vllm.model_executor.layers.activation # noqa F401 +from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser + +batch_size_range = [1, 16, 32, 64, 128] +seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +intermediate_size = [3072, 9728, 12288] +configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) + + +def benchmark_activation( + batch_size: int, + seq_len: int, + intermediate_size: int, + provider: str, + func_name: str, + dtype: torch.dtype, +): + device = "cuda" + num_tokens = batch_size * seq_len + dim = intermediate_size + current_platform.seed_everything(42) + torch.set_default_device(device) + + if func_name == "gelu_and_mul": + layer = CustomOp.op_registry[func_name](approximate="none") + elif func_name == "gelu_and_mul_tanh": + layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh") + elif func_name == "fatrelu_and_mul": + threshold = 0.5 + layer = CustomOp.op_registry[func_name](threshold) + else: + layer = CustomOp.op_registry[func_name]() + + x = torch.randn(num_tokens, dim, dtype=dtype, device=device) + compiled_layer = torch.compile(layer.forward_native) + + if provider == "custom": + fn = lambda: layer(x) + elif provider == "compiled": + fn = lambda: compiled_layer(x) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + fn, quantiles=[0.5, 0.2, 0.8] + ) + return ms, max_ms, min_ms + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the custom activation op.") + parser.add_argument( + "--func-name", + type=str, + choices=[ + "mul_and_silu", + "silu_and_mul", + "gelu_and_mul", + "gelu_and_mul_tanh", + "fatrelu_and_mul", + "swigluoai_and_mul", + "gelu_new", + "gelu_fast", + "quick_gelu", + ], + default="silu_and_mul", + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + args = parser.parse_args() + assert args + + func_name = args.func_name + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + perf_report = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "seq_len", "intermediate_size"], + x_vals=configs, + line_arg="provider", + line_vals=["custom", "compiled"], + line_names=["Custom OP", "Compiled"], + styles=[("blue", "-"), ("green", "-")], + ylabel="ms", + plot_name=f"{func_name}-op-performance", + args={}, + ) + ) + + perf_report( + lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation( + batch_size, seq_len, intermediate_size, provider, func_name, dtype + ) + ).run(print_data=True) diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py new file mode 100644 index 000000000..a61c17edc --- /dev/null +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -0,0 +1,486 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Benchmark script for device communicators: +CustomAllreduce (oneshot, twoshot), PyNcclCommunicator, +and SymmMemCommunicator (multimem, two-shot). + +Usage: + torchrun --nproc_per_node= benchmark_device_communicators.py [options] + +Example: + torchrun --nproc_per_node=2 benchmark_device_communicators.py + --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100 +""" + +import json +import os +import time +from contextlib import nullcontext +from typing import Callable, Optional + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +# Default sequence lengths to benchmark +DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] + +# Fixed hidden size and dtype for all benchmarks +HIDDEN_SIZE = 8192 +BENCHMARK_DTYPE = torch.bfloat16 + +# CUDA graph settings +CUDA_GRAPH_CAPTURE_CYCLES = 10 + + +class CommunicatorBenchmark: + """Benchmark class for testing device communicators.""" + + def __init__( + self, + rank: int, + world_size: int, + device: torch.device, + cpu_group: ProcessGroup, + sequence_lengths: list[int], + ): + self.rank = rank + self.world_size = world_size + self.device = device + self.cpu_group = cpu_group + + # Calculate max_size_override based on largest sequence length + max_seq_len = max(sequence_lengths) + max_tensor_elements = max_seq_len * HIDDEN_SIZE + self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1 + + # Initialize communicators + self.custom_allreduce = None + self.pynccl_comm = None + self.symm_mem_comm = None + self.symm_mem_comm_multimem = None + self.symm_mem_comm_two_shot = None + + self._init_communicators() + + def _init_communicators(self): + """Initialize all available communicators.""" + try: + self.custom_allreduce = CustomAllreduce( + group=self.cpu_group, + device=self.device, + max_size=self.max_size_override, + ) + if not self.custom_allreduce.disabled: + logger.info("Rank %s: CustomAllreduce initialized", self.rank) + else: + logger.info("Rank %s: CustomAllreduce disabled", self.rank) + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e + ) + self.custom_allreduce = None + + try: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, device=self.device + ) + if not self.pynccl_comm.disabled: + logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) + else: + logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) + self.pynccl_comm = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e + ) + self.pynccl_comm = None + + # Initialize variants for SymmMemCommunicator + try: + self.symm_mem_comm_multimem = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=True, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_multimem.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank + ) + else: + self.symm_mem_comm_multimem = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s", + self.rank, + e, + ) + self.symm_mem_comm_multimem = None + + try: + self.symm_mem_comm_two_shot = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=False, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_two_shot.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank + ) + else: + self.symm_mem_comm_two_shot = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s", + self.rank, + e, + ) + self.symm_mem_comm_two_shot = None + + def benchmark_allreduce( + self, sequence_length: int, num_warmup: int, num_trials: int + ) -> dict[str, float]: + """Benchmark allreduce operations for all available communicators.""" + + results = {} + + # Define communicators with their benchmark functions + communicators = [] + + if self.custom_allreduce is not None: + comm = self.custom_allreduce + # CustomAllreduce one-shot + communicators.append( + ( + "ca_1stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "1stage", # env variable value + ) + ) + # CustomAllreduce two-shot + communicators.append( + ( + "ca_2stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "2stage", # env variable value + ) + ) + + if self.pynccl_comm is not None: + comm = self.pynccl_comm + communicators.append( + ( + "pynccl", + lambda t, c=comm: c.all_reduce(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_multimem is not None: + comm = self.symm_mem_comm_multimem + communicators.append( + ( + "symm_mem_multimem", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_two_shot is not None: + comm = self.symm_mem_comm_two_shot + communicators.append( + ( + "symm_mem_two_shot", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) + ) + + # Benchmark each communicator + for name, allreduce_fn, should_use_fn, context, env_var in communicators: + # Set environment variable if needed + if env_var is not None: + os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var + else: + # Clear the environment variable to avoid interference + os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) + + latency = self.benchmark_allreduce_single( + sequence_length, + allreduce_fn, + should_use_fn, + context, + num_warmup, + num_trials, + ) + if latency is not None: + results[name] = latency + + return results + + def benchmark_allreduce_single( + self, + sequence_length: int, + allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], + should_use_fn: Callable[[torch.Tensor], bool], + context, + num_warmup: int, + num_trials: int, + ) -> Optional[float]: + """Benchmark method with CUDA graph optimization.""" + try: + # Create test tensor (2D: sequence_length x hidden_size) + tensor = torch.randn( + sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device + ) + if not should_use_fn(tensor): + return None + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + graph_input = tensor.clone() + + # Warmup before capture + for _ in range(3): + allreduce_fn(graph_input) + + # Capture the graph using context manager + with context: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): + allreduce_fn(graph_input) + + torch.cuda.synchronize() + for _ in range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + torch.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(num_trials): + graph.replay() + torch.cuda.synchronize() + + end_time = time.perf_counter() + + # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES + return ( + (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000 + ) + + except Exception as e: + logger.error("CUDA graph benchmark failed: %s", e) + raise RuntimeError( + f"CUDA graph benchmark failed for communicator: {e}" + ) from e + + +def _calculate_speedup_info(comm_results: dict[str, float]) -> str: + """Calculate speedup information for a single tensor size.""" + if not comm_results: + return "N/A" + + # Find the fastest communicator + fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k]) + fastest_time = comm_results[fastest_comm] + + # Calculate speedup vs PyNccl if available + if "pynccl" in comm_results: + pynccl_time = comm_results["pynccl"] + speedup = pynccl_time / fastest_time + return f"{fastest_comm} ({speedup:.2f}x)" + else: + return f"{fastest_comm} (N/A)" + + +def print_results( + results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int +): + """Print benchmark results in a formatted table.""" + + print(f"\n{'=' * 130}") + print("Device Communicator Benchmark Results") + print( + f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, " + f"Hidden Size: {HIDDEN_SIZE}" + ) + print(f"{'=' * 130}") + + # Get all communicator names + all_comms = set() + for size_results in results.values(): + all_comms.update(size_results.keys()) + + all_comms = sorted(list(all_comms)) + + # Print header + header = f"{'Tensor Shape':<20}{'Tensor Size':<15}" + for comm in all_comms: + header += f"{comm:<20}" + header += f"{'Best (Speedup vs PyNccl)':<30}" + print(header) + print("-" * len(header)) + + # Print results for each sequence length + for seq_len in sequence_lengths: + if seq_len in results: + # Calculate tensor size in elements and bytes + tensor_elements = seq_len * HIDDEN_SIZE + tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize + + # Format tensor size (MB) + tensor_size_mb = tensor_bytes / (1024 * 1024) + tensor_size_str = f"{tensor_size_mb:.2f} MB" + + # Format tensor shape + tensor_shape = f"({seq_len}, {HIDDEN_SIZE})" + + row = f"{tensor_shape:<20}{tensor_size_str:<15}" + for comm in all_comms: + if comm in results[seq_len]: + row += f"{results[seq_len][comm]:<20.3f}" + else: + row += f"{'N/A':<20}" + + # Calculate speedup information + speedup_info = _calculate_speedup_info(results[seq_len]) + row += f"{speedup_info:<30}" + + print(row) + + print(f"{'=' * 130}") + print("All times are in milliseconds (ms) per allreduce operation") + print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)") + + +def main(): + parser = FlexibleArgumentParser(description="Benchmark device communicators") + + parser.add_argument( + "--sequence-lengths", + type=int, + nargs="+", + default=DEFAULT_SEQUENCE_LENGTHS, + help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)", + ) + + parser.add_argument( + "--num-warmup", type=int, default=5, help="Number of warmup iterations" + ) + + parser.add_argument( + "--num-trials", type=int, default=50, help="Number of benchmark trials" + ) + + parser.add_argument("--output-json", type=str, help="Output results to JSON file") + + args = parser.parse_args() + + # Initialize distributed + if not dist.is_initialized(): + dist.init_process_group(backend="gloo") + rank = dist.get_rank() + world_size = dist.get_world_size() + + # Set device + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + + # Get CPU process group + cpu_group = dist.new_group(backend="gloo") + + # Disable USE_SYMM_MEM to avoid affecting the max_sizes + # in symm_mem and custom_all_reduce for benchmark + os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" + + # Initialize benchmark + benchmark = CommunicatorBenchmark( + rank, world_size, device, cpu_group, args.sequence_lengths + ) + + # Run benchmarks + all_results = {} + + for seq_len in args.sequence_lengths: + if rank == 0: + logger.info( + "Benchmarking sequence length: %s (tensor shape: %s x %s)", + seq_len, + seq_len, + HIDDEN_SIZE, + ) + + results = benchmark.benchmark_allreduce( + sequence_length=seq_len, + num_warmup=args.num_warmup, + num_trials=args.num_trials, + ) + + all_results[seq_len] = results + + # Synchronize between ranks + dist.barrier() + + # Print results (only rank 0) + if rank == 0: + print_results(all_results, args.sequence_lengths, world_size) + + # Save to JSON if requested + if args.output_json: + # Add speedup information to results + enhanced_results = {} + for seq_len, comm_results in all_results.items(): + enhanced_results[seq_len] = { + "timings": comm_results, + "speedup_info": _calculate_speedup_info(comm_results), + } + + output_data = { + "world_size": world_size, + "dtype": str(BENCHMARK_DTYPE), + "hidden_size": HIDDEN_SIZE, + "sequence_lengths": args.sequence_lengths, + "num_warmup": args.num_warmup, + "num_trials": args.num_trials, + "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES, + "results": enhanced_results, + } + + with open(args.output_json, "w") as f: + json.dump(output_data, f, indent=2) + + logger.info("Results saved to %s", args.output_json) + + # Cleanup + if cpu_group != dist.group.WORLD: + dist.destroy_process_group(cpu_group) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 3d38d4b35..89309c79f 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -637,7 +637,7 @@ def bench_optype( # Clear LoRA optimization hash-maps. _LORA_A_PTR_DICT.clear() _LORA_B_PTR_DICT.clear() - # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) torch.cuda.synchronize() diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 752c2d008..94f3f1ae1 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -419,8 +419,10 @@ class BenchmarkWorker: ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. + block_n = block_quant_shape[0] if block_quant_shape else None + block_k = block_quant_shape[1] if block_quant_shape else None op_config = get_moe_configs( - num_experts, shard_intermediate_size // 2, dtype_str + num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k ) if op_config is None: config = get_default_config( @@ -430,6 +432,7 @@ class BenchmarkWorker: hidden_size, topk, dtype_str, + block_quant_shape, ) else: config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] @@ -591,7 +594,11 @@ def main(args: argparse.Namespace): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): + elif config.architectures[0] in ( + "Qwen2MoeForCausalLM", + "Qwen3MoeForCausalLM", + "Qwen3NextForCausalLM", + ): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size @@ -675,7 +682,11 @@ def main(args: argparse.Namespace): is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") - + if use_deep_gemm: + raise ValueError( + "Tuning with --use-deep-gemm is not supported as it only tunes Triton " + "kernels. Please remove the flag." + ) start = time.time() configs = _distribute( "tune", diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py new file mode 100644 index 000000000..9ac8f5e65 --- /dev/null +++ b/benchmarks/kernels/benchmark_polynorm.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools + +import torch + +from vllm import _custom_ops as vllm_ops +from vllm.triton_utils import triton + + +def polynorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + + def norm(x, eps: float): + return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps) + + x = x.float() + return ( + ( + weight[0] * norm(x**3, eps) + + weight[1] * norm(x**2, eps) + + weight[2] * norm(x, eps) + + bias + ) + .to(weight.dtype) + .view(orig_shape) + ) + + +def polynorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + + out = torch.empty_like(x) + vllm_ops.poly_norm(out, x, weight, bias, eps) + output = out + + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_dim): + dtype = torch.bfloat16 + x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") + weight = torch.ones(3, dtype=dtype, device="cuda") + bias = torch.ones(1, dtype=dtype, device="cuda") + + output_naive = polynorm_naive(x, weight, bias) + output_vllm = polynorm_vllm(x, weight, bias) + + if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +dim_range = [2048, 4096] +configs = list(itertools.product(dim_range, batch_size_range, seq_length_range)) + + +def get_benchmark(): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["dim", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["naive", "vllm"], + line_names=["Naive", "vLLM"], + styles=[("blue", "-"), ("red", "-")], + ylabel="us", + plot_name="polynorm-perf", + args={}, + ) + ) + def benchmark(dim, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_dim = dim * 4 + + x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") + weight = torch.ones(3, dtype=dtype, device="cuda") + bias = torch.ones(1, dtype=dtype, device="cuda") + + quantiles = [0.5, 0.2, 0.8] + + if provider == "naive": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: polynorm_naive(x, weight, bias), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: polynorm_vllm(x, weight, bias), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-dim", + type=int, + default=8192, + help="Intermediate size of MLP", + ) + parser.add_argument( + "--save-path", + type=str, + default="./configs/polnorm/", + help="Path to save polnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff( + batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_dim=args.hidden_dim, + ) + + benchmark = get_benchmark() + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 603ce5ecf..6ddab4621 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -259,6 +259,7 @@ if __name__ == "__main__": # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) (None, None, None), (None, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, None), (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), ] diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 40903c6c3..131df74c7 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -274,6 +274,7 @@ if __name__ == "__main__": quant_dtypes = [ # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) (None, None, None), + (FP8_DTYPE, FP8_DTYPE, None), (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), ] diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index e648a9107..df2b713e4 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -56,7 +56,7 @@ def w8a8_block_matmul( Bs: The per-block quantization scale for `B`. block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128]. - output_dytpe: The dtype of the returned tensor. + output_dtype: The dtype of the returned tensor. Returns: torch.Tensor: The result of matmul. @@ -141,6 +141,7 @@ def get_weight_shapes(tp_size): # cannot TP total = [ (512 + 64, 7168), + (2112, 7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index d23b7b6e4..66d85eaf5 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -962,7 +962,7 @@ async def main_mp( # At this point all the clients finished, # collect results (TTFT, TPOT, etc.) from all the clients. - # This needs to happens before calling join on the clients + # This needs to happen before calling join on the clients # (result_queue should be emptied). while not result_queue.empty(): client_metrics.append(result_queue.get()) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 52bfd82c7..064944632 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED) if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") message(STATUS "Apple Silicon Detected") + set(APPLE_SILICON_FOUND TRUE) set(ENABLE_NUMA OFF) check_sysctl(hw.optional.neon ASIMD_FOUND) check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND) @@ -189,7 +190,7 @@ else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 49defccbb..3d32121f1 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f + GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index 6dd6f269f..c60f1823b 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -36,6 +36,7 @@ limitations under the License. #if !defined(CUDA_VERSION) || CUDA_VERSION < 12040 void sm100_cutlass_mla_decode( torch::Tensor const& out, + torch::Tensor const& lse, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, @@ -64,11 +65,11 @@ struct IsPersistent { static const bool value = v; }; -template > +template > struct MlaSm100 { using Element = T; using ElementAcc = float; - using ElementOut = T; + using ElementOut = TOut; using TileShape = Shape<_128, _128, Shape<_512, _64>>; using TileShapeH = cute::tuple_element_t<0, TileShape>; @@ -99,6 +100,7 @@ struct MlaSm100 { template typename T::Fmha::Arguments args_from_options( at::Tensor const& out, + at::Tensor const& lse, at::Tensor const& q_nope, at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, @@ -162,7 +164,10 @@ typename T::Fmha::Arguments args_from_options( stride_PT, page_count_total, page_size}, - {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, + {static_cast(out.data_ptr()), + stride_O, + static_cast(lse.defined() ? lse.data_ptr() : nullptr), + stride_LSE}, hw_info, // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will @@ -178,9 +183,10 @@ typename T::Fmha::Arguments args_from_options( return arguments; } -template +template void runMla( at::Tensor const& out, + at::Tensor const& lse, at::Tensor const& q_nope, at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, @@ -190,9 +196,9 @@ void runMla( double sm_scale, int64_t num_kv_splits, cudaStream_t stream) { - using MlaSm100Type = MlaSm100; + using MlaSm100Type = MlaSm100; typename MlaSm100Type::Fmha fmha; - auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); + auto arguments = args_from_options(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); CUTLASS_CHECK(fmha.can_implement(arguments)); @@ -214,6 +220,7 @@ void runMla( void sm100_cutlass_mla_decode( torch::Tensor const& out, + torch::Tensor const& lse, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, @@ -233,14 +240,14 @@ void sm100_cutlass_mla_decode( DISPATCH_BOOL(page_size == 128, IsPaged128, [&] { DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] { if (in_dtype == at::ScalarType::Half) { - runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + runMla>( + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::BFloat16) { - runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + runMla>( + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { - runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + runMla>( + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else { TORCH_CHECK(false, "Unsupported input data type of MLA"); } @@ -253,7 +260,7 @@ void sm100_cutlass_mla_decode( int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) { // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc) // which are float, so Element type here doesn't matter. - using MlaSm100Type = MlaSm100; + using MlaSm100Type = MlaSm100; // Get split kv. Requires problem shape and sm_count only. typename MlaSm100Type::Fmha::Arguments arguments; diff --git a/csrc/cache.h b/csrc/cache.h index fb0c353b9..fd230bec2 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -47,4 +47,12 @@ void gather_and_maybe_dequant_cache( torch::Tensor const& cu_seq_lens, // [BATCH+1] int64_t batch_size, const std::string& kv_cache_dtype, torch::Tensor const& scale, - std::optional seq_starts = std::nullopt); \ No newline at end of file + std::optional seq_starts = std::nullopt); + +// TODO(hc): cp_gather_cache need support scaled kvcahe in the future. +void cp_gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, std::optional seq_starts = std::nullopt); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 21381c30d..03db59ec9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include "cuda_utils.h" #include "cuda_compat.h" @@ -779,3 +780,145 @@ void gather_and_maybe_dequant_cache( DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE); } + +namespace vllm { +template +// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by +// block_size. +__global__ void cp_gather_cache( + const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRY_SIZE] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRY_SIZE] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t block_size, const int32_t entry_size, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride, + const int32_t* __restrict__ seq_starts // Optional: starting offsets per + // batch +) { + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = cu_seq_lens[bid]; + const int32_t seq_end = cu_seq_lens[bid + 1]; + const int32_t seq_len = seq_end - seq_start; + const int32_t tot_slots = seq_len; + const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits); + + const int32_t split_start = split * split_slots; + const int32_t split_end = min((split + 1) * split_slots, tot_slots); + + const bool is_active_split = (split_start < tot_slots); + + if (!is_active_split) return; + + // Adjust the pointer for the block_table for this batch. + // If seq_starts is provided, compute an offset based on it + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = split_start; + if (seq_starts != nullptr) { + offset += seq_starts[bid]; + } + int32_t offset_div = offset / block_size; + offset = offset % block_size; + const int32_t* batch_block_table = block_table + batch_offset; + + // Adjust dst pointer based on the cumulative sequence lengths. + dst += seq_start * dst_entry_stride; + + auto copy_entry = [&](const scalar_t* __restrict__ _src, + scalar_t* __restrict__ _dst) { + for (int i = threadIdx.x; i < entry_size; i += blockDim.x) + _dst[i] = _src[i]; + }; + + for (int pid = split_start; pid < split_end; ++pid) { + auto block_id = batch_block_table[offset_div]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + pid * dst_entry_stride; + copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr); + offset += 1; + // bump to next block + if (offset == block_size) { + offset_div += 1; + offset = 0; + } + } +} +} // namespace vllm + +// Macro to dispatch the kernel based on the data type. +#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \ + vllm::cp_gather_cache<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + block_size, entry_size, block_table_stride, cache_block_stride, \ + cache_entry_stride, dst_entry_stride, seq_starts_ptr); + +// Gather sequences from the cache into the destination tensor. +// - cu_seq_lens contains the cumulative sequence lengths for each batch +// - block_table contains the cache block indices for each sequence +// - Optionally, seq_starts (if provided) offsets the starting slot index by +// seq_starts[bid] +void cp_gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, + std::optional seq_starts = std::nullopt) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t entry_size = src_cache.flatten(2, -1).size(2); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32, + "cu_seq_lens must be int32"); + if (seq_starts.has_value()) { + TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, + "seq_starts must be int32"); + } + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == cu_seq_lens.device(), + "src_cache and cu_seq_lens must be on the same device"); + if (seq_starts.has_value()) { + TORCH_CHECK(src_cache.device() == seq_starts.value().device(), + "src_cache and seq_starts must be on the same device"); + } + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size. + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(1024); + + TORCH_CHECK(src_cache.dtype() == dst.dtype(), + "src_cache and dst must have the same dtype"); + + const int dtype_bits = src_cache.element_size() * 8; + const int32_t* seq_starts_ptr = + seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr; + + if (dtype_bits == 32) { + CALL_CP_GATHER_CACHE(uint32_t); + } else if (dtype_bits == 16) { + CALL_CP_GATHER_CACHE(uint16_t); + } else if (dtype_bits == 8) { + CALL_CP_GATHER_CACHE(uint8_t); + } else { + TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); + } +} diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp index ab8cbbbf4..51bca37e6 100644 --- a/csrc/cpu/cpu_types_vxe.hpp +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -12,7 +12,7 @@ namespace vec_op { #define vec_sub(a, b) ((a) - (b)) #define vec_mul(a, b) ((a) * (b)) #define vec_div(a, b) ((a) / (b)) -#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left // FIXME: FP16 is not fully supported in Torch-CPU diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index f3f00edb3..6def0e061 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) { delete ptr; } +DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) { + this->realloc(allocation_unit * 128); +} + +void DNNLScratchPadManager::realloc(size_t new_size) { + new_size = round(new_size); + if (new_size > size_) { + ptr_ = std::aligned_alloc(64, new_size); + size_ = new_size; + } +} + +DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() { + static DNNLScratchPadManager manager; + return &manager; +} + template class DNNLPrimitiveCache { public: @@ -166,6 +183,23 @@ struct hash { hash()(static_cast(val.bias_type)); } }; + +template <> +struct hash { + size_t operator()( + const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size); + } +}; + +template <> +struct hash { + size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ + hash()(val.a_m_stride) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; } // namespace std bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, @@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, l.bias_type == r.bias_type; } +bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size; +} + +bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l, + const MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride && + l.use_bias == r.use_bias && l.bias_type == r.bias_type; +} + static std::shared_ptr get_w8a8_class_primitive_cache( const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, @@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { } dnnl::matmul matmul = get_matmul_cache(args); + + auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5); + scratchpad_storage->set_data_handle( + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + matmul.execute(default_stream(), memory_cache_); default_stream().wait(); } @@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( return m_size_cache_->get_or_create(key, [&]() { dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + manager->realloc(desc.scratchpad_desc().get_size()); return dnnl::matmul(desc); }); } @@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr); set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); + + memory_cache_[DNNL_ARG_SCRATCHPAD] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); } dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( @@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( dnnl::memory::format_tag::ab); dnnl::primitive_attr attr; + + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + // For PER_TOKEN, scales will be applied in outside epilogue if (a_qs_ == QuantizationStrategy::PER_TENSOR) { attr.set_scales_mask(DNNL_ARG_SRC, 0); @@ -344,3 +404,120 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( attr); } } + +MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), args.ab_type), + m_size_cache_(nullptr) { + assert(ab_type_ == dnnl::memory::data_type::f32 || + ab_type_ == dnnl::memory::data_type::bf16 || + ab_type_ == dnnl::memory::data_type::f16); + prepack_weight(args.b_ptr, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, + .a_m_stride = DNNL_RUNTIME_DIM_VAL, + .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +static std::shared_ptr +get_matul_class_primitive_cache( + const MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +void MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + + auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); + scratchpad_storage->set_data_handle( + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_}; + m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_); + } + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + manager->realloc(desc.scratchpad_desc().get_size()); + return dnnl::matmul(desc); + }); +} + +dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc( + const MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md; + dnnl::memory::desc b_md; + if (first_time) { + a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, + dnnl::memory::format_tag::ab); + b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_, + dnnl::memory::format_tag::any); + } else { + a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, + {key.a_m_stride, 1}); + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + if (key.use_bias) { + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} + +void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory( + {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get()); + + memory_cache_[DNNL_ARG_SCRATCHPAD] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h index 54ceefced..ad6773d2b 100644 --- a/csrc/cpu/dnnl_helper.h +++ b/csrc/cpu/dnnl_helper.h @@ -59,6 +59,30 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; } +class DNNLScratchPadManager { + public: + static constexpr size_t allocation_unit = 4 * 1024 * 1024; // 4KB + + static DNNLScratchPadManager* get_dnnl_scratchpad_manager(); + + DNNLScratchPadManager(); + + template + T* get_data() { + return reinterpret_cast(ptr_); + } + + static size_t round(size_t size) { + return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; + } + + void realloc(size_t new_size); + + private: + size_t size_; + void* ptr_; +}; + class DNNLMatMulPrimitiveHandler { public: virtual ~DNNLMatMulPrimitiveHandler() = default; @@ -166,4 +190,54 @@ class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { std::shared_ptr m_size_cache_; }; +class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + struct Args : public DNNLMatMulPrimitiveHandler::Args { + dnnl::memory::data_type ab_type; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + dnnl_dim_t a_m_stride; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const void* a_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + MatMulPrimitiveHandler(const Args& args); + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + std::shared_ptr m_size_cache_; +}; + #endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp index acc3b9ecd..9a3af4ac9 100644 --- a/csrc/cpu/dnnl_kernels.cpp +++ b/csrc/cpu/dnnl_kernels.cpp @@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, } } - float scale_val, azp_val; + float scale_val; + float azp_val = 0.0f; if constexpr (AZP) { float max_scalar = max_value.reduce_max(); float min_scalar = min_value.reduce_min(); @@ -379,6 +380,7 @@ void onednn_scaled_mm( exec_args.a_ptr = a.data_ptr(); exec_args.a_m_size = a.size(0); exec_args.bias_ptr = nullptr; + exec_args.bias_type = get_dnnl_type(); exec_args.use_bias = false; exec_args.a_scales_ptr = nullptr; exec_args.a_zero_points_ptr = nullptr; @@ -492,3 +494,56 @@ void dynamic_scaled_int8_quant( } }); } + +int64_t create_onednn_mm_handler(const torch::Tensor& b, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + + MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler", + [&] { + args.c_type = get_dnnl_type(); + args.ab_type = get_dnnl_type(); + }); + + return reinterpret_cast(new MatMulPrimitiveHandler(args)); +} + +void onednn_mm(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const std::optional& bias, int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.stride(-1) == 1); + TORCH_CHECK(c.is_contiguous()); + MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + + MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_m_size = a.size(0); + exec_args.a_m_stride = a.stride(0); + + VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] { + if (bias.has_value()) { + exec_args.use_bias = true; + exec_args.bias_type = get_dnnl_type(); + exec_args.bias_ptr = bias->data_ptr(); + } else { + exec_args.use_bias = false; + exec_args.bias_type = get_dnnl_type(); + exec_args.bias_ptr = nullptr; + } + exec_args.a_ptr = a.data_ptr(); + exec_args.c_ptr = c.data_ptr(); + + ptr->execute(exec_args); + }); +} diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp index beeccff78..94b24c2f1 100644 --- a/csrc/cpu/sgl-kernels/moe.cpp +++ b/csrc/cpu/sgl-kernels/moe.cpp @@ -215,7 +215,7 @@ int moe_align_block_size( offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M); } }); - // TODO: do we need to vecterize this ? + // TODO: do we need to vectorize this ? for (int mb = 0; mb < num_token_blocks; ++mb) { offsets[mb + 1] += offsets[mb]; } diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index c9f426bdf..98c3ebc5a 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -21,6 +21,12 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const std::optional& bias, int64_t handler); +int64_t create_onednn_mm_handler(const torch::Tensor& b, + int64_t primitive_cache_size); + +void onednn_mm(torch::Tensor& c, const torch::Tensor& a, + const std::optional& bias, int64_t handler); + void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens); @@ -153,6 +159,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("release_dnnl_matmul_handler(int handler) -> ()", &release_dnnl_matmul_handler); + // Create oneDNN GEMM handler + ops.def( + "create_onednn_mm_handler(Tensor b, int " + "primitive_cache_size) -> int", + &create_onednn_mm_handler); + + // oneDNN GEMM + ops.def( + "onednn_mm(Tensor! c, Tensor a, Tensor? bias, " + "int handler) -> ()"); + ops.impl("onednn_mm", torch::kCPU, &onednn_mm); + // Create oneDNN W8A8 handler ops.def( "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 44709b459..58926f642 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16; #include #include #include +#include +#include namespace vllm { #define CUDACHECK(cmd) \ @@ -555,22 +557,47 @@ class CustomAllreduce { size /= d; auto bytes = size * sizeof(typename packed_t::P); int blocks = std::min(block_limit, (size + threads - 1) / threads); + + // Check environment variable once + const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO"); + bool force_1stage = false; + bool force_2stage = false; + if (env_algo != nullptr) { + if (std::strcmp(env_algo, "1stage") == 0 || + std::strcmp(env_algo, "oneshot") == 0) { + force_1stage = true; + } else if (std::strcmp(env_algo, "2stage") == 0 || + std::strcmp(env_algo, "twoshot") == 0) { + force_2stage = true; + } else { + throw std::runtime_error( + "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) + + ". Valid values: 1stage, oneshot, 2stage, twoshot"); + } + } + #define KL(ngpus, name) \ name<<>>(ptrs, sg_, self_sg_, output, \ rank_, size); -#define REDUCE_CASE(ngpus) \ - case ngpus: { \ - if (world_size_ == 2) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else if (fully_connected_) { \ - if ((world_size_ <= 4 && bytes < 512 * 1024) || \ - (world_size_ <= 8 && bytes < 256 * 1024)) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else { \ - KL(ngpus, cross_device_reduce_2stage); \ - } \ - } \ - break; \ +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (force_1stage) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (force_2stage) { \ + KL(ngpus, cross_device_reduce_2stage); \ + } else { \ + if (world_size_ == 2) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (fully_connected_) { \ + if ((world_size_ <= 4 && bytes < 512 * 1024) || \ + (world_size_ <= 8 && bytes < 256 * 1024)) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else { \ + KL(ngpus, cross_device_reduce_2stage); \ + } \ + } \ + } \ + break; \ } switch (world_size_) { diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp deleted file mode 100644 index ec75c29e5..000000000 --- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp +++ /dev/null @@ -1,123 +0,0 @@ -// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl -// clang-format off -#pragma once - -#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" - -#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" - - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// GMMA_TMA_WS_SS (BlockScaled Builders) -template < - class ElementA, - class GmemLayoutATag, - int AlignmentA, - class ElementB, - class GmemLayoutBTag, - int AlignmentB, - class ElementAccumulator, - class TileShape_MNK, - class ClusterShape_MNK, - class StageCountType, - int ScaleGranularityM -> -struct CollectiveBuilder< - arch::Sm90, - arch::OpClassTensorOp, - ElementA, - GmemLayoutATag, - AlignmentA, - ElementB, - GmemLayoutBTag, - AlignmentB, - ElementAccumulator, - TileShape_MNK, - ClusterShape_MNK, - StageCountType, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum, - cute::enable_if_t< - not detail::is_use_rmem_A()> -> { - using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum; - - static_assert(is_static::value); - static_assert(is_static::value); -#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED - static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); -#endif - static_assert(detail::is_aligned(), - "Should meet TMA alignment requirement\n"); - - static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v); - static constexpr bool IsFP8Input = detail::is_input_fp8(); - static_assert((!IsFP8Input || !IsArrayOfPointersGemm), - "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); - - // For fp32 types, map to tf32 MMA value type - using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; - using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; - - static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); - static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); - - static constexpr bool IsCooperative = cute::is_any_of_v>; - using AtomLayoutMNK = cute::conditional_t>, Layout>>; - - using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< - ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); - - using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); - using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); - - using SmemLayoutAtomA = decltype(detail::ss_smem_selector< - GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); - using SmemLayoutAtomB = decltype(detail::ss_smem_selector< - GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); - - static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); - - static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; - - using SmemCopyAtomA = void; - using SmemCopyAtomB = void; - - using CollectiveOp = CollectiveMma< - DispatchPolicy, - TileShape_MNK, - ElementA, - TagToStrideA_t, - ElementB, - TagToStrideB_t, - TiledMma, - GmemTiledCopyA, - SmemLayoutAtomA, - SmemCopyAtomA, - cute::identity, - GmemTiledCopyB, - SmemLayoutAtomB, - SmemCopyAtomB, - cute::identity - >; -}; - - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp deleted file mode 100644 index 13b90e998..000000000 --- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// clang-format off -// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp - -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include "cute/algorithm/clear.hpp" -#include "cute/tensor.hpp" - -////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////FP8 Accumulation/////////////////////////// -////////////////////////////////////////////////////////////////////////////// -/// This class provides API to promote (add) or scale (multiply_add) the results -/// from the tensor core accumulators to the main accumulators when the number -/// of MMAs reaches the max number of MMA interval specified by user, after that -/// the tensor core accumulators are zeroed. -////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { - -template < - class EngineAccum, - class LayoutAccum> -struct GmmaFP8AccumulationWithScale { - using TensorAccum = cute::Tensor; - using ElementAccumulator = typename EngineAccum::value_type; - - static_assert(is_static::value, "Accumulator Layout should be static"); - static_assert(is_rmem::value , "Accumulator tensor must be rmem resident."); - -private: - TensorAccum& accum_; - TensorAccum accum_temp_; - - uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. - uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop - uint32_t mma_count_; // current executed MMAs - uint32_t reset_accum_flag_; // accum needs to be zeroed or not. - - // promote or `add` the partial accumulators to main accumulator (FADD). - CUTLASS_DEVICE - void promote_core() { - warpgroup_wait<0>(); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(accum_); ++i) { - accum_(i) += accum_temp_(i); - } - } - - // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). - template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_core(const cute::Tensor &scale) { - using TensorScale = cute::Tensor; - - static_assert(is_static::value, "Scale Layout should be static"); - static_assert(is_rmem::value , "Scale tensor must be rmem resident."); - - static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); - - warpgroup_wait<0>(); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(accum_); ++i) { - accum_(i) += accum_temp_(i) * scale(i); - } - } - -public: - CUTLASS_DEVICE - GmmaFP8AccumulationWithScale( - TensorAccum &accum, - uint32_t accum_promotion_interval, - uint32_t mma_count_per_mainloop_iteration) - : accum_(accum), - accum_promotion_interval_(accum_promotion_interval), - mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), - mma_count_(0), - reset_accum_flag_(0) - { - accum_temp_ = cute::make_fragment_like(accum); - } - - // - // Methods (Common) - // - - CUTLASS_DEVICE - TensorAccum& operator()() { - return accum_temp_; - } - - /// prepare the MMA accumulators when initialization or zeroing is required. - CUTLASS_DEVICE - bool prepare_if_needed() { - return reset_accum_flag_; - } - - // - // Methods (for FADD version) - // - - /// promote (add) the results from the MMA accumulators to main accumulator if needed. - CUTLASS_DEVICE - void promote_if_needed() { - mma_count_ += mma_count_per_mainloop_iteration_; - reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); - if (reset_accum_flag_) { - promote_core(); - mma_count_ = 0; - } - } - - /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. - CUTLASS_DEVICE - void promote_residue_if_needed() { - if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { - promote_core(); - } - } - - // - // Methods (for FFMA version) - // - - /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. - template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_if_needed(const cute::Tensor &scale) { - mma_count_ += mma_count_per_mainloop_iteration_; - reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); - if (reset_accum_flag_) { - scale_core(scale); - mma_count_ = 0; - } - } - - /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. - template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_residue_if_needed(const cute::Tensor &scale) { - if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { - scale_core(scale); - } - } -}; - -} // namespace cutlass::gemm::collective diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp deleted file mode 100644 index ce7f47cf7..000000000 --- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ /dev/null @@ -1,729 +0,0 @@ -// clang-format off -// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp - -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/trace.h" -#include "cutlass/numeric_types.h" - -#include "cute/arch/cluster_sm90.hpp" -#include "cute/arch/copy_sm80.hpp" -#include "cute/arch/copy_sm90.hpp" -#include "cute/algorithm/functional.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cute/algorithm/gemm.hpp" -#include "cute/numeric/arithmetic_tuple.hpp" - -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { -using namespace cute; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// WarpSpecialized Mainloop -template < - int Stages, - class ClusterShape, - class KernelSchedule, - int ScaleGranularityM_, - class TileShape_, - class ElementA_, - class StrideA_, - class ElementB_, - class StrideB_, - class TiledMma_, - class GmemTiledCopyA_, - class SmemLayoutAtomA_, - class SmemCopyAtomA_, - class TransformA_, - class GmemTiledCopyB_, - class SmemLayoutAtomB_, - class SmemCopyAtomB_, - class TransformB_> -struct CollectiveMma< - MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8, - TileShape_, - ElementA_, - StrideA_, - ElementB_, - StrideB_, - TiledMma_, - GmemTiledCopyA_, - SmemLayoutAtomA_, - SmemCopyAtomA_, - TransformA_, - GmemTiledCopyB_, - SmemLayoutAtomB_, - SmemCopyAtomB_, - TransformB_> -{ - // - // Type Aliases - // - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; - using TileShape = TileShape_; - using ElementA = ElementA_; - using StrideA = StrideA_; - using ElementB = ElementB_; - using StrideB = StrideB_; - using TiledMma = TiledMma_; - using ElementAccumulator = typename TiledMma::ValTypeC; - using ElementBlockScale = ElementAccumulator; - using GmemTiledCopyA = GmemTiledCopyA_; - using GmemTiledCopyB = GmemTiledCopyB_; - using SmemLayoutAtomA = SmemLayoutAtomA_; - using SmemLayoutAtomB = SmemLayoutAtomB_; - using SmemCopyAtomA = SmemCopyAtomA_; - using SmemCopyAtomB = SmemCopyAtomB_; - using TransformA = TransformA_; - using TransformB = TransformB_; - using ArchTag = typename DispatchPolicy::ArchTag; - - using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); - using MainloopPipeline = cutlass::PipelineTmaAsync; - using PipelineState = cutlass::PipelineState; - using PipelineParams = typename MainloopPipeline::Params; - - // Two threads per CTA are producers (1 for operand tile and 32 for scales) - static constexpr int NumProducerThreadEvents = 33; - - static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; - static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; - - static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - - static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - - static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); - - // Tile along modes in a way that maximizes the TMA box size. - using SmemLayoutA = decltype(tile_to_shape( - SmemLayoutAtomA{}, - make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), - cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - using SmemLayoutB = decltype(tile_to_shape( - SmemLayoutAtomB{}, - make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), - cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - - // Block scaling gmem-to-smem copy atom - using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; - using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; - - // Block scaling smem layout - using SmemLayoutScaleA = Layout, Int>>; - using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. - - static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); - static_assert(cute::is_base_of::value && - cute::is_base_of::value, - "MMA atom must source both A and B operand from smem_desc for this mainloop."); - static_assert(cute::is_same_v || cute::is_same_v, - "GmemTiledCopy - invalid SM90 TMA copy atom specified."); - static_assert(cute::is_same_v || cute::is_same_v, - "GmemTiledCopy - invalid SM90 TMA copy atom specified."); - static_assert(cute::is_same_v, - "ElementAccumulator and ElementBlockScale should be same datatype"); - - struct SharedStorage - { - struct TensorStorage : cute::aligned_struct<128> { - cute::array_aligned> smem_A; // mxk - cute::array_aligned> smem_B; // nxk - cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k - cute::array_aligned> smem_scale_B; // 1xk - } tensors; - - using PipelineStorage = typename MainloopPipeline::SharedStorage; - PipelineStorage pipeline; - }; - using TensorStorage = typename SharedStorage::TensorStorage; - using PipelineStorage = typename SharedStorage::PipelineStorage; - - // Host side kernel arguments - struct Arguments { - ElementA const* ptr_A; - StrideA dA; - ElementB const* ptr_B; - StrideB dB; - ElementBlockScale const* ptr_scale_A; - ElementBlockScale const* ptr_scale_B; - }; - - // Device side kernel params - struct Params { - // Assumption: StrideA is congruent with Problem_MK - using TMA_A = decltype(make_tma_copy_A_sm90( - GmemTiledCopyA{}, - make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), - SmemLayoutA{}(_,_,0), - TileShape{}, - ClusterShape{})); - // Assumption: StrideB is congruent with Problem_NK - using TMA_B = decltype(make_tma_copy_B_sm90( - GmemTiledCopyB{}, - make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), - SmemLayoutB{}(_,_,0), - TileShape{}, - ClusterShape{})); - TMA_A tma_load_a; - TMA_B tma_load_b; - uint32_t tma_transaction_bytes = TmaTransactionBytes; - uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; - uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; - // Block scaling factors for A and B - ElementBlockScale const* ptr_scale_A; - ElementBlockScale const* ptr_scale_B; - }; - - // - // Methods - // - - template - static constexpr Params - to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { - (void) workspace; - - // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - auto ptr_A = reinterpret_cast(args.ptr_A); - auto ptr_B = reinterpret_cast(args.ptr_B); - - Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); - Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); - typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( - GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,cute::Int<0>{}), - TileShape{}, - ClusterShape{}); - typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( - GmemTiledCopyB{}, - tensor_b, - SmemLayoutB{}(_,_,cute::Int<0>{}), - TileShape{}, - ClusterShape{}); - uint32_t transaction_bytes_mk = TmaTransactionBytesMK; - uint32_t transaction_bytes_nk = TmaTransactionBytesNK; - uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; - - return { - tma_load_a, - tma_load_b, - transaction_bytes, - transaction_bytes_mk, - transaction_bytes_nk, - args.ptr_scale_A, - args.ptr_scale_B - }; - } - - template - static bool - can_implement( - ProblemShape const& problem_shape, - [[maybe_unused]] Arguments const& args) { - constexpr int tma_alignment_bits = 128; - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - bool implementable = true; - constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); - constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); - - if (!implementable) { - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); - } - return implementable; - } - - static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; - static constexpr int K_PIPE_MMAS = 1; - static constexpr uint32_t TmaTransactionBytesMK = - cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); - static constexpr uint32_t TmaTransactionBytesNK = - cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); - static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; - - /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance - CUTLASS_DEVICE - static void prefetch_tma_descriptors(Params const& mainloop_params) - { - cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); - } - - /// Set up the data needed by this collective for load and mma. - /// Returns a tuple of tensors. The collective and the kernel layer have the contract - /// Returned tuple must contain at least two elements, with the first two elements being: - /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) - /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) - template - CUTLASS_DEVICE auto - load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { - using X = Underscore; - // Separate out problem shape for convenience - auto [M,N,K,L] = problem_shape_MNKL; - - // TMA requires special handling of strides to deal with coord codomain mapping - // Represent the full tensors -- get these from TMA - Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) - Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) - - // Make tiled views, defer the slice - Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) - Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) - - constexpr auto scales_m = Int{}; - auto tM = get<2>(gA_mkl.shape()); - auto tN = get<2>(gB_nkl.shape()); - auto tK = get<3>(gA_mkl.shape()); - - // Make the tiled views of scale tensors - auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) - auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); - auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) - auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); - - // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and - // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. - Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) - Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) - - return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Producer Perspective - template < - class TensorA, class TensorB, - class TensorScaleA, class TensorScaleB, - class KTileIterator, class BlockCoord - > - CUTLASS_DEVICE void - load( - Params const& mainloop_params, - MainloopPipeline pipeline, - PipelineState smem_pipe_write, - cute::tuple const& load_inputs, - BlockCoord const& blk_coord, - KTileIterator k_tile_iter, int k_tile_count, - int thread_idx, - uint32_t block_rank_in_cluster, - TensorStorage& shared_tensors) { - int lane_predicate = cute::elect_one_sync(); - - // Blockscaling: Tma loads for load_input and CpAsync for load_scale - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) - - // - // Prepare the TMA loads for A and B - // - - constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); - uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; - - Tensor gA_mkl = get<0>(load_inputs); - Tensor gB_nkl = get<1>(load_inputs); - - auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); - auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); - - // Partition the inputs based on the current block coordinates. - auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; - Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) - Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) - - - // Block scaling: load_scale has scaling tensors in global memory which are not tiled - Tensor mScaleA_mkl = get<2>(load_inputs); - Tensor mScaleB_nkl = get<3>(load_inputs); - auto scales_m = get<0>(mScaleA_mkl.shape()); - - Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); - - Tensor gScaleA = local_tile( - mScaleA_mkl, make_tile(Int{}), - make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) - Tensor cScaleA = local_tile( - cScaleA_mkl, make_tile(Int{}), - make_coord(m_coord,_,l_coord)); - Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) - - // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 - TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, - Layout>{}, Layout>{}); // (1,1,1) - TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, - Layout>{}, Layout>{}); // (1,1,1) - ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); - ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); - - Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); - Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); - Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); - - Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); - Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); - - // Applies the mapping from block_tma_a - Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) - Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) - - Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) - Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) - - uint16_t mcast_mask_a = 0; - uint16_t mcast_mask_b = 0; - - // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors - // Maps the tile -> block, value - if constexpr (cute::is_same_v) { - auto block_layout = Layout{}; // (m,n) -> block_id - for (int n = 0; n < size<1>(block_layout); ++n) { - mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); - } - } - - if constexpr (cute::is_same_v) { - auto block_layout = Layout{}; // (m,n) -> block_id - for (int m = 0; m < size<0>(block_layout); ++m) { - mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); - } - } - - // Allocate predicate tensors for a_scales (since we can't guarantee that - // all scales are valid, since we could have a partial tiles along M) - Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); - #pragma unroll - for (int i = 0; i < size(tApA_ScaleA); ++i) { - tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; - } - - // Mainloop - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - // LOCK smem_pipe_write for _writing_ - pipeline.producer_acquire(smem_pipe_write); - - // - // Copy gmem to smem for *k_tile_iter - // - int write_stage = smem_pipe_write.index(); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - // Copy operands A and B from global memory to shared memory - if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - - // Copy scale tensors from global memory to shared memory - copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); - copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); - - ++k_tile_iter; - - // Advance smem_pipe_write - ++smem_pipe_write; - } - } - - /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster - CUTLASS_DEVICE void - load_tail( - MainloopPipeline pipeline, - PipelineState smem_pipe_write) { - int lane_predicate = cute::elect_one_sync(); - - // Issue the epilogue waits - if (lane_predicate) { - /* This helps avoid early exit of blocks in Cluster - * Waits for all stages to either be released (all - * Consumer UNLOCKs), or if the stage was never used - * then would just be acquired since the phase was - * still inverted from make_producer_start_state - */ - pipeline.producer_tail(smem_pipe_write); - } - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Consumer Perspective - template < - class FrgTensorC - > - CUTLASS_DEVICE void - mma(MainloopPipeline pipeline, - PipelineState smem_pipe_read, - FrgTensorC& accum, - int k_tile_count, - int thread_idx, - TensorStorage& shared_tensors, - Params const& mainloop_params) { - - - static_assert(is_rmem::value, "C tensor must be rmem resident."); - static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); - static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); - static_assert(cute::is_void_v, - "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); - static_assert(cute::is_void_v, - "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); - - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - - // Block scaling - Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), - Layout< - Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, - Stride, _0, Int> - >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) - - // - // Define C accumulators and A/B partitioning - // - - // Layout of warp group to thread mapping - - static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and - stride<0>(typename TiledMma::BLayout{}) == 0 and - size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and - size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, - "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); - - constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; - Layout warp_group_thread_layout = make_layout(Int{}, - Int{}); - - int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); - - TiledMma tiled_mma; - auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); - - Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. - - Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) - Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) - - // Allocate "fragments/descriptors" - Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) - Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) - - CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M - CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N - CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K - CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE - CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE - CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE - - // - // PIPELINED MAIN LOOP - // - static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), - "ERROR : Incorrect number of MMAs in flight"); - - // We release buffers to producer warps(dma load) with some mmas in flight - PipelineState smem_pipe_release = smem_pipe_read; - - // Per block scale values for operand A and B - - using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. - using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above - - Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) - ElementBlockScale scale_b; - - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - - GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); - warpgroup_fence_operand(accumulation()); - CUTLASS_PRAGMA_UNROLL - for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - int read_stage = smem_pipe_read.index(); - - // Load per block scale values from shared memory to registers. - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); - } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. - } else { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; - } - } - - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M,K) x (V,N,K) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - warpgroup_commit_batch(); - - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); - - ++smem_pipe_read; - } - - warpgroup_fence_operand(accumulation()); - // Mainloop GMMAs - k_tile_count -= prologue_mma_count; - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - // - // Compute on k_tile - // - - int read_stage = smem_pipe_read.index(); - - // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); - } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. - } else { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; - } - } - - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - warpgroup_fence_operand(accumulation()); - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M,K) x (V,N,K) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - warpgroup_commit_batch(); - - /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed - warpgroup_wait(); - warpgroup_fence_operand(accumulation()); - - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); - - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - - // Advance smem_pipe_read and smem_pipe_release - ++smem_pipe_read; - ++smem_pipe_release; - } - - accumulation.scale_residue_if_needed(tCrScaleAViewAsC); - - warpgroup_fence_operand(accumulation()); - } - - /// Perform a Consumer Epilogue to release all buffers - CUTLASS_DEVICE void - mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - k_tile_count -= prologue_mma_count; - - smem_pipe_release.advance(k_tile_count); - - // Wait on all GMMAs to complete - warpgroup_wait<0>(); - - for (int count = 0; count < prologue_mma_count; ++count) { - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - ++smem_pipe_release; - } - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp deleted file mode 100644 index df809e27a..000000000 --- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "cutlass/gemm/dispatch_policy.hpp" - -namespace cutlass::gemm { - -////////////////////////////////////////////////////////////////////////////// - -// FP8 related policies (including Blocked Scaled Accumulation) -// `ScaleGranularityM` specifies scaling granularity along M, while zero-value -// `ScaleGranularityM` indicates that scaling granularity is -// `size<0>(TileShape_MNK{})` along M. -template -struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum - : KernelTmaWarpSpecializedCooperative {}; - -// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp -// specialized dynamic schedule For FP8 kernels with Block Scaling -template , - class KernelSchedule = KernelTmaWarpSpecialized, - int ScaleGranularityM = - 0 // `ScaleGranularityM` specifies scaling granularity along M, - // while zero-value `ScaleGranularityM` indicates that scaling - // granularity is `size<0>(TileShape_MNK{})` along M. - > -struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 - : MainloopSm90TmaGmmaWarpSpecialized { - static_assert( - cute::is_same_v< - KernelSchedule, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< - ScaleGranularityM>>, - "KernelSchedule must be one of the warp specialized policies"); -}; - -////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh index e7fbba4cd..085ee1290 100644 --- a/csrc/cutlass_extensions/vllm_collective_builder.cuh +++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -1,6 +1,6 @@ #pragma once -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" namespace cutlass::gemm::collective { using namespace cute; diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index f7b75c483..995374a50 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -19,6 +19,13 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_CASE_HALF_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__)) + // ROCm devices might use either fn or fnuz, so set up dispatch table for both. // A host-based check at runtime will create a preferred FP8 type for ROCm // such that the correct kernel is dispatched. diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index f051eb070..05be023de 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -140,6 +140,211 @@ fused_add_rms_norm_kernel( } } +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. + + _f16VecPN struct extends _f16Vec to add operations specifically required for + polynomial normalization (poly norm). + The original _f16Vec does not include the sum-of-powers computation or + in-place polynomial normalization logic. */ +template +struct alignas(16) _f16VecPN : _f16Vec { + using Base = _f16Vec; + using Converter = typename Base::Converter; + using T1 = typename Base::T1; + using T2 = typename Base::T2; + using Base::data; + + __device__ auto sum_pows() const { + float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f; + +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + float x2 = z.x * z.x; + float x4 = x2 * x2; + float x6 = x4 * x2; + + float y2 = z.y * z.y; + float y4 = y2 * y2; + float y6 = y4 * y2; + + s2 += x2 + y2; + s4 += x4 + y4; + s6 += x6 + y6; + } + return std::make_tuple(s2, s4, s6); + } + + __device__ void poly_norm_inplace(const float w2_inv_std, + const float w1_inv_std2, + const float w0_inv_std3, const float bias) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + + float x2 = z.x * z.x; + float x3 = x2 * z.x; + z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias; + + float y2 = z.y * z.y; + float y3 = y2 * z.y; + z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias; + + auto out = Converter::convert(z); + data[i] = out.x; + data[i + 1] = out.y; + } + } +}; + +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [3] + const scalar_t* __restrict__ bias, // [1] + const float epsilon, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16VecPN>); + static_assert(sizeof(_f16VecPN) == sizeof(scalar_t) * width); + + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast*>(input); + const int vec_hidden_size = hidden_size / width; + float variance = 0.0f; + float variance2 = 0.0f; + float variance3 = 0.0f; + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16VecPN temp = input_v[id]; + auto [x2, x4, x6] = temp.sum_pows(); + + variance += x2; + variance2 += x4; + variance3 += x6; + } + + float3 thread_variances = make_float3(variance, variance2, variance3); + + struct SumOp { + __device__ float3 operator()(const float3& a, const float3& b) const { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } + }; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + float3 block_variances = + BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x); + + variance = block_variances.x; + variance2 = block_variances.y; + variance3 = block_variances.z; + + __shared__ float s_w2_inv_std; + __shared__ float s_w1_inv_std2; + __shared__ float s_w0_inv_std3; + __shared__ float s_bias; + + if (threadIdx.x == 0) { + float w0 = (float)weight[0]; + float w1 = (float)weight[1]; + float w2 = (float)weight[2]; + s_bias = (float)bias[0]; + + s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon); + s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon); + s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon); + } + __syncthreads(); + + auto* __restrict__ out_v = reinterpret_cast<_f16VecPN*>(out); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16VecPN temp = input_v[id]; + temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias); + out_v[id] = temp; + } +} + +/* Generic poly_norm_kernel + The width field is not used here but necessary for other specializations. + */ +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [3] + const scalar_t* __restrict__ bias, // [1] + const float epsilon, const int hidden_size) { + float variance = 0.0f; + float variance2 = 0.0f; + float variance3 = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float x2 = x * x; + float x4 = x2 * x2; + float x6 = x4 * x2; + + variance += x2; + variance2 += x4; + variance3 += x6; + } + + float3 thread_variances = make_float3(variance, variance2, variance3); + + struct SumOp { + __device__ float3 operator()(const float3& a, const float3& b) const { + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); + } + }; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + float3 block_variances = + BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x); + + variance = block_variances.x; + variance2 = block_variances.y; + variance3 = block_variances.z; + + __shared__ float s_w2_inv_std; + __shared__ float s_w1_inv_std2; + __shared__ float s_w0_inv_std3; + __shared__ float s_bias; + + if (threadIdx.x == 0) { + float w0 = (float)weight[0]; + float w1 = (float)weight[1]; + float w2 = (float)weight[2]; + s_bias = (float)bias[0]; + + s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon); + s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon); + s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon); + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float x2 = x * x; + float x3 = x2 * x; + + out[blockIdx.x * hidden_size + idx] = + (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 + + s_bias); + } +} + } // namespace vllm void rms_norm(torch::Tensor& out, // [..., hidden_size] @@ -219,3 +424,49 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] LAUNCH_FUSED_ADD_RMS_NORM(0); } } + +#define LAUNCH_FUSED_POLY_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \ + vllm::poly_norm_kernel<<>>( \ + out.data_ptr(), input.data_ptr(), \ + weight.data_ptr(), bias.data_ptr(), epsilon, \ + hidden_size); \ + }); + +void poly_norm(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [3] + torch::Tensor& bias, // [1] + double epsilon) { + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(out.data_ptr() != input.data_ptr()); + + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto out_ptr = reinterpret_cast(out.data_ptr()); + bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_POLY_NORM(8); + } else { + LAUNCH_FUSED_POLY_NORM(0); + } +} diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index c4ddbc142..d534e138d 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -27,11 +27,12 @@ template + bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_> struct Selective_Scan_fwd_kernel_traits { static_assert(kNItems_ % 4 == 0); using input_t = input_t_; using weight_t = weight_t_; + using state_t = state_t_; static constexpr int kNThreads = kNThreads_; // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy. static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3; @@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; - input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + + typename Ktraits::state_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + cache_index * params.ssm_states_batch_stride + dim_id * kNRows * params.ssm_states_dim_stride; @@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { if (threadIdx.x == 0) { smem_running_prefix[state_idx] = prefix_op.running_prefix; if (chunk == n_chunks - 1) { - ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y); + ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y); } } #pragma unroll @@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } } -template +template void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block // processing 1 row. @@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] { BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] { - using Ktraits = Selective_Scan_fwd_kernel_traits; + using Ktraits = Selective_Scan_fwd_kernel_traits; constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t); dim3 grid(params.batch, params.dim / kNRows); auto kernel = &selective_scan_fwd_kernel; @@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { }); } -template +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { #ifndef USE_ROCM if (params.seqlen <= 128) { - selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 256) { - selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { - selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 1024) { - selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); } else { - selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #else if (params.seqlen <= 256) { - selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { - selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 1024) { - selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); } else { - selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #endif } -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") -#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \ +#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...) \ if (ITYPE == at::ScalarType::Half) { \ using input_t = at::Half; \ using weight_t = float; \ - __VA_ARGS__(); \ + if (STYPE == at::ScalarType::Half) { \ + using state_t = at::Half; \ + __VA_ARGS__(); \ + } else if (STYPE == at::ScalarType::Float) { \ + using state_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \ + } \ } else if (ITYPE == at::ScalarType::BFloat16) { \ using input_t = at::BFloat16; \ using weight_t = float; \ - __VA_ARGS__(); \ + if (STYPE == at::ScalarType::BFloat16) { \ + using state_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (STYPE == at::ScalarType::Float) { \ + using state_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \ + } \ } else if (ITYPE == at::ScalarType::Float) { \ using input_t = float; \ using weight_t = float; \ + using state_t = float; \ __VA_ARGS__(); \ } else { \ AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ } -template +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); void set_ssm_params_fwd(SSMParamsBase ¶ms, @@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout at::Tensor out = delta; - TORCH_CHECK(ssm_states.scalar_type() == input_type); + // ssm_states can now be either the same as input_type or float32 + auto state_type = ssm_states.scalar_type(); + TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float); TORCH_CHECK(ssm_states.is_cuda()); TORCH_CHECK(ssm_states.stride(-1) == 1); @@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const at::cuda::OptionalCUDAGuard device_guard(device_of(u)); auto stream = at::cuda::getCurrentCUDAStream().stream(); - DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { - selective_scan_fwd_cuda(params, stream); + DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] { + selective_scan_fwd_cuda(params, stream); }); } diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 78f7b3cc1..accbb0985 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -28,6 +28,7 @@ namespace cg = cooperative_groups; namespace vllm { namespace moe { +constexpr float kNegInfinity = INFINITY * -1; constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr int32_t WARP_SIZE = 32; constexpr int32_t BLOCK_SIZE = 512; @@ -512,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel( warp_id * topk; s_topk_idx += warp_id * topk; - T value = cuda::std::numeric_limits::min(); - T topk_group_value = cuda::std::numeric_limits::min(); + T value = kNegInfinity; + T topk_group_value = kNegInfinity; int32_t num_equalto_topkth_group; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) @@ -539,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel( __syncwarp(); // Ensure all threads have valid data before reduction topk_group_value = cg::reduce(tile, value, cg::greater()); if (value == topk_group_value) { - value = cuda::std::numeric_limits::min(); + value = kNegInfinity; } pre_count_equal_to_top_value = count_equal_to_top_value; count_equal_to_top_value = __popc(__ballot_sync( - FULL_WARP_MASK, (value == cuda::std::numeric_limits::min()))); + FULL_WARP_MASK, (value == cuda_cast(kNegInfinity)))); } num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; } @@ -555,7 +556,7 @@ __global__ void group_idx_and_topk_idx_kernel( int count_equalto_topkth_group = 0; bool if_proceed_next_topk = - (topk_group_value != cuda::std::numeric_limits::min()); + (topk_group_value != cuda_cast(kNegInfinity)); if (case_id < num_tokens && if_proceed_next_topk) { for (int i_group = 0; i_group < n_group; i_group++) { if ((group_scores[i_group] > topk_group_value) || @@ -568,7 +569,7 @@ __global__ void group_idx_and_topk_idx_kernel( (i < num_experts_per_group) && isfinite(cuda_cast( scores_with_bias[offset + i])) ? scores_with_bias[offset + i] - : cuda::std::numeric_limits::min(); + : cuda_cast(kNegInfinity); queue.add(candidates, offset + i); } if (group_scores[i_group] == topk_group_value) { diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 99c52ef17..cd80bfda7 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -573,7 +573,7 @@ void topk_softmax( stream); } else { - assert(topk_indices.scalar_type() == at::ScalarType::Int64); + TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr(), topk_weights.data_ptr(), diff --git a/csrc/ops.h b/csrc/ops.h index 86fe848e2..2ee5df4ca 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, + torch::Tensor& bias, double epsilon); + void apply_repetition_penalties_(torch::Tensor& logits, const torch::Tensor& prompt_mask, const torch::Tensor& output_mask, @@ -130,6 +133,13 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); +#ifndef USE_ROCM +void silu_and_mul_nvfp4_quant(torch::Tensor& out, + torch::Tensor& output_block_scale, + torch::Tensor& input, + torch::Tensor& input_global_scale); +#endif + void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); @@ -346,4 +356,4 @@ void qr_open_handles(fptr_t _fa, const std::vector& handles); void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half = false); int64_t qr_max_size(); -#endif \ No newline at end of file +#endif diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu index fdac47c42..57bcbaae4 100644 --- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -11,6 +11,7 @@ #include "core/registration.h" #include "cutlass/cutlass.h" +#include #include "cute/tensor.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" @@ -169,6 +170,11 @@ struct W4A8GemmKernel { int k = A.size(1); int n = B.size(1); + // safely cast group_size to int + TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits::max(), + "group_size out of supported range for int: ", group_size); + int const group_size_int = static_cast(group_size); + // Allocate output const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); auto device = A.device(); @@ -181,7 +187,7 @@ struct W4A8GemmKernel { auto A_ptr = static_cast(A.const_data_ptr()); auto B_ptr = static_cast(B.const_data_ptr()); auto D_ptr = static_cast(D.data_ptr()); - // can we avoid harcode the 8 here + // can we avoid hardcode the 8 here auto S_ptr = static_cast const*>( group_scales.const_data_ptr()); @@ -192,7 +198,7 @@ struct W4A8GemmKernel { cute::tile_to_shape(LayoutAtomQuant{}, shape_B); // strides - int const scale_k = cutlass::ceil_div(k, group_size); + int const scale_k = cutlass::ceil_div(k, group_size_int); StrideA stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); // Reverse stride here due to swap and transpose @@ -211,8 +217,8 @@ struct W4A8GemmKernel { using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments; MainloopArguments mainloop_arguments{ - B_ptr, layout_B_reordered, A_ptr, stride_A, - S_ptr, stride_S, group_size}; + B_ptr, layout_B_reordered, A_ptr, stride_A, + S_ptr, stride_S, group_size_int}; EpilogueArguments epilogue_arguments{ ChTokScalesEpilogue::prepare_args(channel_scales, token_scales), diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu new file mode 100644 index 000000000..b4eb141cb --- /dev/null +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include "dispatch_utils.h" + +#include "cuda_utils.h" +#include "nvfp4_utils.cuh" + +namespace vllm { + +template +__inline__ __device__ PackedVec compute_silu(PackedVec& vec, + PackedVec& vec2) { + PackedVec result; +#pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) { + if constexpr (std::is_same_v) { + half2 val(0.5f, 0.5f); + half2 t0 = __hmul2(vec.elts[i], val); + half2 t1 = __hfma2(h2tanh(t0), val, val); + half2 t2 = __hmul2(vec.elts[i], t1); + result.elts[i] = __hmul2(t2, vec2.elts[i]); + } else { + __nv_bfloat162 val(0.5f, 0.5f); + __nv_bfloat162 t0 = __hmul2(vec.elts[i], val); + __nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val); + __nv_bfloat162 t2 = __hmul2(vec.elts[i], t1); + result.elts[i] = __hmul2(t2, vec2.elts[i]); + } + } + return result; +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec& vec, + PackedVec& vec2, + float SFScaleVal, + uint8_t* SFout) { + PackedVec out_silu = compute_silu(vec, vec2); + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(out_silu.elts[0]); + +// Local maximum value. +#pragma unroll + for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(out_silu.elts[i])); + } + + // Get the absolute maximum among all 16 values (two threads). + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. + float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // 8 bits representation of the SF. + uint8_t fp8SFVal; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + // Extract the 8 exponent bits from float32. + // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. + uint32_t tmp = reinterpret_cast(SFValue) >> 23; + fp8SFVal = tmp & 0xff; + // Convert back to fp32. + reinterpret_cast(SFValue) = tmp << 23; + } else { + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; + // Convert back to fp32. + SFValue = float(tmp); + } + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * + // reciprocal(SFScaleVal)) + float outputScale = + SFValue != 0 ? reciprocal_approximate_ftz( + SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. + float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; + +#pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + if constexpr (std::is_same_v) { + fp2Vals[i] = __half22float2(out_silu.elts[i]); + } else { + fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]); + } + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +} + +// Use UE4M3 by default. +template +__global__ void __launch_bounds__(1024, 4) + silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, + uint32_t* SFout) { + using PackedVec = PackedVec; + static constexpr int CVT_FP4_NUM_THREADS_PER_SF = + (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); + static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, + "Vec size is not matched."); + + // Get the global scaling factor, which will be applied to the SF. + // Note SFScale is the same as next GEMM's alpha, which is + // (448.f / (Alpha_A / 6.f)). + float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0]; + + // Input tensor row/col loops. + for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { + for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; + colIdx += blockDim.x) { + int64_t inOffset = + rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx; + int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + + numCols / CVT_FP4_ELTS_PER_THREAD + colIdx; + PackedVec in_vec = reinterpret_cast(in)[inOffset]; + PackedVec in_vec2 = reinterpret_cast(in)[inOffset2]; + + // Get the output tensor offset. + // Same as inOffset because 8 elements are packed into one uint32_t. + int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; + ; + auto& out_pos = out[outOffset]; + + auto sf_out = + cvt_quant_to_fp4_get_sf_out_offset( + rowIdx, colIdx, numCols, SFout); + + out_pos = silu_and_cvt_warp_fp16_to_fp4( + in_vec, in_vec2, SFScaleVal, sf_out); + } + } +} + +} // namespace vllm + +void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] + torch::Tensor& output_sf, + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& input_sf) { + int32_t m = input.size(0); + int32_t n = input.size(1) / 2; + + TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16."); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Half || + input.scalar_type() == at::ScalarType::BFloat16, + "Unsupported input data type for quantize_to_fp4."); + + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); + + auto input_sf_ptr = static_cast(input_sf.data_ptr()); + auto sf_out = static_cast(output_sf.data_ptr()); + auto output_ptr = static_cast(output.data_ptr()); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); + dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); + int const numBlocksPerSM = 2048 / block.x; + dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); + + VLLM_DISPATCH_HALF_TYPES( + input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] { + using cuda_type = vllm::CUDATypeConverter::Type; + auto input_ptr = static_cast(input.data_ptr()); + vllm::silu_and_cvt_fp16_to_fp4<<>>( + m, n, input_ptr, input_sf_ptr, + reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); + }); +} diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu index 03db5cc19..2c8df6144 100644 --- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu +++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include #include diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 190d66f31..ce3ba2c19 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -1,247 +1,42 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include +#include +#include + #include #include -#include #include +#include "dispatch_utils.h" -template -struct TypeConverter { - using Type = half2; -}; // keep for generality +#include "nvfp4_utils.cuh" -template <> -struct TypeConverter { - using Type = half; -}; - -template <> -struct TypeConverter { - using Type = half2; -}; - -template <> -struct TypeConverter<__nv_bfloat162> { - using Type = __nv_bfloat16; -}; - -template <> -struct TypeConverter<__nv_bfloat16> { - using Type = __nv_bfloat162; -}; - -#define ELTS_PER_THREAD 8 - -constexpr int CVT_FP4_ELTS_PER_THREAD = 8; -constexpr int CVT_FP4_SF_VEC_SIZE = 16; - -// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), - "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); - return val; -#else - return 0; -#endif -} - -// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), - "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); - return val; -#else - return 0; -#endif -} - -// Fast reciprocal. -inline __device__ float reciprocal_approximate_ftz(float a) { - float b; - asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); - return b; -} - -template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || - CVT_FP4_NUM_THREADS_PER_SF == 2); - - // One pair of threads write one SF to global memory. - // TODO: stage through smem for packed STG.32 - // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. - int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; - } -#endif - return nullptr; -} - -// Define a 16 bytes packed data type. -template -struct PackedVec { - typename TypeConverter::Type elts[4]; -}; - -template <> -struct PackedVec<__nv_fp8_e4m3> { - __nv_fp8x2_e4m3 elts[8]; -}; - -// Quantizes the provided PackedVec into the uint32_t output -template -__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, - uint8_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - // Get absolute maximum values among the local 8 values. - auto localMax = __habs2(vec.elts[0]); - - // Local maximum value. - #pragma unroll - for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - localMax = __hmax2(localMax, __habs2(vec.elts[i])); - } - - // Get the absolute maximum among all 16 values (two threads). - localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); - // Get the final absolute maximum values. - float vecMax = float(__hmax(localMax.x, localMax.y)); - - // Get the SF (max value of the vector / max value of e2m1). - // maximum value of e2m1 = 6.0. - // TODO: use half as compute data type. - float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); - // 8 bits representation of the SF. - uint8_t fp8SFVal; - // Write the SF to global memory (STG.8). - if constexpr (UE8M0_SF) { - // Extract the 8 exponent bits from float32. - // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. - uint32_t tmp = reinterpret_cast(SFValue) >> 23; - fp8SFVal = tmp & 0xff; - // Convert back to fp32. - reinterpret_cast(SFValue) = tmp << 23; - } else { - // Here SFValue is always positive, so E4M3 is the same as UE4M3. - __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); - reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; - // Convert back to fp32. - SFValue = float(tmp); - } - // Get the output scale. - // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * - // reciprocal(SFScaleVal)) - float outputScale = - SFValue != 0 ? reciprocal_approximate_ftz( - SFValue * reciprocal_approximate_ftz(SFScaleVal)) - : 0.0f; - - if (SFout) { - // Write the SF to global memory (STG.8). - *SFout = fp8SFVal; - } - - // Convert the input to float. - float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; - - #pragma unroll - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - if constexpr (std::is_same_v) { - fp2Vals[i] = __half22float2(vec.elts[i]); - } else { - fp2Vals[i] = __bfloat1622float2(vec.elts[i]); - } - fp2Vals[i].x *= outputScale; - fp2Vals[i].y *= outputScale; - } - - // Convert to e2m1 values. - uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); - - // Write the e2m1 values to global memory. - return e2m1Vec; -#else - return 0; -#endif -} +namespace vllm { // Use UE4M3 by default. template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(512, 4) cvt_fp16_to_fp4( -#else -cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, - uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(512, 4) + cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, uint32_t* SFout, + uint32_t* input_offset_by_experts, + uint32_t* output_scale_offset_by_experts, int n_experts, + bool low_latency) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -299,8 +94,8 @@ cvt_fp16_to_fp4( &input_offset_by_experts[chunk_start + 12])); local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]); - // Check against the 16 loaded offsets - #pragma unroll +// Check against the 16 loaded offsets +#pragma unroll for (int i = 0; i < 16; i++) { if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) { rowIdx_in_expert = rowIdx - local_offsets[i]; @@ -330,21 +125,15 @@ cvt_fp16_to_fp4( out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } -#endif } // Kernel for LARGE_M_TOPK = true (large m_topk optimized version) template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(1024, 4) cvt_fp16_to_fp4( -#else -cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, - uint32_t* output_scale_offset_by_experts, int n_experts) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(1024, 4) + cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, uint32_t* SFout, + uint32_t* input_offset_by_experts, + uint32_t* output_scale_offset_by_experts, int n_experts) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -425,7 +214,6 @@ cvt_fp16_to_fp4( out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } -#endif } template @@ -501,6 +289,8 @@ void quant_impl(void* output, void* output_scale, void* input, } } +} // namespace vllm + /*Quantization entry for fp4 experts quantization*/ #define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor") #define CHECK_CONTIGUOUS(x, m) \ @@ -560,23 +350,17 @@ void scaled_fp4_experts_quant_sm100a( // 4 means 4 fp8 values are packed into one int32 TORCH_CHECK(output_scale.size(1) * 4 == padded_k); - auto in_dtype = input.dtype(); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(input.get_device()); - if (in_dtype == at::ScalarType::Half) { - quant_impl(output.data_ptr(), output_scale.data_ptr(), - input.data_ptr(), input_global_scale.data_ptr(), - input_offset_by_experts.data_ptr(), - output_scale_offset_by_experts.data_ptr(), m_topk, k, - n_experts, stream); - } else if (in_dtype == at::ScalarType::BFloat16) { - quant_impl<__nv_bfloat16>(output.data_ptr(), output_scale.data_ptr(), - input.data_ptr(), input_global_scale.data_ptr(), - input_offset_by_experts.data_ptr(), - output_scale_offset_by_experts.data_ptr(), m_topk, - k, n_experts, stream); - } else { - TORCH_CHECK(false, "Expected input data type to be half or bfloat16"); - } + + VLLM_DISPATCH_HALF_TYPES( + input.scalar_type(), "nvfp4_experts_quant_kernel", [&] { + using cuda_type = vllm::CUDATypeConverter::Type; + vllm::quant_impl( + output.data_ptr(), output_scale.data_ptr(), input.data_ptr(), + input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(), + output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts, + stream); + }); } diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu index 1b61bd451..c2b39e543 100644 --- a/csrc/quantization/fp4/nvfp4_quant_entry.cu +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -32,6 +32,14 @@ void scaled_fp4_experts_quant_sm100a( torch::Tensor const& output_scale_offset_by_experts); #endif +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, + torch::Tensor& output_sf, + torch::Tensor& input, + torch::Tensor& input_sf); +#endif + void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) { #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ @@ -54,3 +62,13 @@ void scaled_fp4_experts_quant( TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel"); } + +void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf, + torch::Tensor& input, torch::Tensor& input_sf) { +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf); +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, "No compiled silu_and_mul nvfp4 quantization kernel"); +} diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 4e080de15..0c1b9ef06 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -23,245 +23,18 @@ #include #include +#include "dispatch_utils.h" #include "cuda_utils.h" +#include "nvfp4_utils.cuh" -// Get type2 from type or vice versa (applied to half and bfloat16) -template -struct TypeConverter { - using Type = half2; -}; // keep for generality - -template <> -struct TypeConverter { - using Type = half; -}; - -template <> -struct TypeConverter { - using Type = half2; -}; - -template <> -struct TypeConverter<__nv_bfloat162> { - using Type = __nv_bfloat16; -}; - -template <> -struct TypeConverter<__nv_bfloat16> { - using Type = __nv_bfloat162; -}; - -#define ELTS_PER_THREAD 8 - -constexpr int CVT_FP4_ELTS_PER_THREAD = 8; -constexpr int CVT_FP4_SF_VEC_SIZE = 16; - -// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), - "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); - return val; -#else - return 0; -#endif -} - -// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), - "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); - return val; -#else - return 0; -#endif -} - -// Fast reciprocal. -inline __device__ float reciprocal_approximate_ftz(float a) { - float b; - asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); - return b; -} - -template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || - CVT_FP4_NUM_THREADS_PER_SF == 2); - - // One pair of threads write one SF to global memory. - // TODO: stage through smem for packed STG.32 - // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. - int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; - } -#endif - return nullptr; -} - -// Define a 16 bytes packed data type. -template -struct PackedVec { - typename TypeConverter::Type elts[4]; -}; - -template <> -struct PackedVec<__nv_fp8_e4m3> { - __nv_fp8x2_e4m3 elts[8]; -}; - -// Quantizes the provided PackedVec into the uint32_t output -template -__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, - uint8_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - // Get absolute maximum values among the local 8 values. - auto localMax = __habs2(vec.elts[0]); - - // Local maximum value. - #pragma unroll - for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - localMax = __hmax2(localMax, __habs2(vec.elts[i])); - } - - // Get the absolute maximum among all 16 values (two threads). - localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); - // Get the final absolute maximum values. - float vecMax = float(__hmax(localMax.x, localMax.y)); - - // Get the SF (max value of the vector / max value of e2m1). - // maximum value of e2m1 = 6.0. - // TODO: use half as compute data type. - float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); - // 8 bits representation of the SF. - uint8_t fp8SFVal; - // Write the SF to global memory (STG.8). - if constexpr (UE8M0_SF) { - // Extract the 8 exponent bits from float32. - // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. - uint32_t tmp = reinterpret_cast(SFValue) >> 23; - fp8SFVal = tmp & 0xff; - // Convert back to fp32. - reinterpret_cast(SFValue) = tmp << 23; - } else { - // Here SFValue is always positive, so E4M3 is the same as UE4M3. - __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); - reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; - // Convert back to fp32. - SFValue = float(tmp); - } - // Get the output scale. - // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * - // reciprocal(SFScaleVal)) - float outputScale = - SFValue != 0 ? reciprocal_approximate_ftz( - SFValue * reciprocal_approximate_ftz(SFScaleVal)) - : 0.0f; - - if (SFout) { - // Write the SF to global memory (STG.8). - *SFout = fp8SFVal; - } - - // Convert the input to float. - float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; - - #pragma unroll - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - if constexpr (std::is_same_v) { - fp2Vals[i] = __half22float2(vec.elts[i]); - } else { - fp2Vals[i] = __bfloat1622float2(vec.elts[i]); - } - fp2Vals[i].x *= outputScale; - fp2Vals[i].y *= outputScale; - } - - // Convert to e2m1 values. - uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); - - // Write the e2m1 values to global memory. - return e2m1Vec; -#else - return 0; -#endif -} +namespace vllm { // Use UE4M3 by default. template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(512, 4) cvt_fp16_to_fp4( -#else -cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(512, 4) + cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -293,7 +66,6 @@ cvt_fp16_to_fp4( cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } } -#endif } template @@ -332,6 +104,8 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, int multiProcessorCount, cudaStream_t stream); +} // namespace vllm + void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, torch::Tensor const& input, torch::Tensor const& output_sf, @@ -340,6 +114,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, int32_t n = input.size(1); TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16."); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Half || + input.scalar_type() == at::ScalarType::BFloat16, + "Unsupported input data type for quantize_to_fp4."); int multiProcessorCount = get_device_attribute(cudaDevAttrMultiProcessorCount, -1); @@ -353,24 +130,10 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, // We don't support e8m0 scales at this moment. bool useUE8M0 = false; - switch (input.scalar_type()) { - case torch::kHalf: { - auto input_ptr = reinterpret_cast(input.data_ptr()); - invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, - useUE8M0, multiProcessorCount, stream); - break; - } - case torch::kBFloat16: { - auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr()); - invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, - useUE8M0, multiProcessorCount, stream); - break; - } - default: { - std::cerr << "Observing: " << input.scalar_type() - << " for the input datatype which is invalid"; - throw std::runtime_error( - "Unsupported input data type for quantize_to_fp4."); - } - } + VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { + using cuda_type = vllm::CUDATypeConverter::Type; + auto input_ptr = static_cast(input.data_ptr()); + vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, + sf_out, useUE8M0, multiProcessorCount, stream); + }); } diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh new file mode 100644 index 000000000..48e4959de --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_utils.cuh @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#define ELTS_PER_THREAD 8 + +constexpr int CVT_FP4_ELTS_PER_THREAD = 8; +constexpr int CVT_FP4_SF_VEC_SIZE = 16; + +namespace vllm { + +// Convert PyTorch cpp type to CUDA type +template +struct CUDATypeConverter { + using Type = T; +}; + +template <> +struct CUDATypeConverter { + using Type = half; +}; + +template <> +struct CUDATypeConverter { + using Type = __nv_bfloat16; +}; + +// Get type2 from type or vice versa (applied to half and bfloat16) +template +struct TypeConverter { + using Type = half2; +}; // keep for generality + +template <> +struct TypeConverter { + using Type = half; +}; + +template <> +struct TypeConverter { + using Type = half2; +}; + +template <> +struct TypeConverter<__nv_bfloat162> { + using Type = __nv_bfloat16; +}; + +template <> +struct TypeConverter<__nv_bfloat16> { + using Type = __nv_bfloat162; +}; + +// Define a 16 bytes packed data type. +template +struct PackedVec { + typename TypeConverter::Type elts[4]; +}; + +template <> +struct PackedVec<__nv_fp8_e4m3> { + __nv_fp8x2_e4m3 elts[8]; +}; + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), + "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); + return val; +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), + "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +template +__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, + int numCols, + SFType* SFout) { + static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || + CVT_FP4_NUM_THREADS_PER_SF == 2); + + // One pair of threads write one SF to global memory. + // TODO: stage through smem for packed STG.32 + // is it better than STG.8 from 4 threads ? + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] + + int32_t mTileIdx = mIdx / (32 * 4); + // SF vector size 16. + int factor = CVT_FP4_SF_VEC_SIZE * 4; + int32_t numKTiles = (numCols + factor - 1) / factor; + int64_t mTileStride = numKTiles * 32 * 4 * 4; + + int32_t kTileIdx = (kIdx / 4); + int64_t kTileStride = 32 * 4 * 4; + + // M tile layout [32, 4] is column-major. + int32_t outerMIdx = (mIdx % 32); + int64_t outerMStride = 4 * 4; + + int32_t innerMIdx = (mIdx % (32 * 4)) / 32; + int64_t innerMStride = 4; + + int32_t innerKIdx = (kIdx % 4); + int64_t innerKStride = 1; + + // Compute the global offset. + int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + + outerMIdx * outerMStride + innerMIdx * innerMStride + + innerKIdx * innerKStride; + + return reinterpret_cast(SFout) + SFOffset; + } + return nullptr; +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, + uint8_t* SFout) { + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(vec.elts[0]); + +// Local maximum value. +#pragma unroll + for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(vec.elts[i])); + } + + // Get the absolute maximum among all 16 values (two threads). + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. + float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // 8 bits representation of the SF. + uint8_t fp8SFVal; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + // Extract the 8 exponent bits from float32. + // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. + uint32_t tmp = reinterpret_cast(SFValue) >> 23; + fp8SFVal = tmp & 0xff; + // Convert back to fp32. + reinterpret_cast(SFValue) = tmp << 23; + } else { + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; + // Convert back to fp32. + SFValue = float(tmp); + } + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * + // reciprocal(SFScaleVal)) + float outputScale = + SFValue != 0 ? reciprocal_approximate_ftz( + SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. + float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; + +#pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + if constexpr (std::is_same_v) { + fp2Vals[i] = __half22float2(vec.elts[i]); + } else { + fp2Vals[i] = __bfloat1622float2(vec.elts[i]); + } + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +} + +} // namespace vllm diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 0d14ba159..8fd536ef4 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8): )) def prepacked_type_key(prepack_type: PrepackTypeConfig): - # For now we we can just use the first accumulator type seen since + # For now, we can just use the first accumulator type seen since # the tensor core shapes/layouts don't vary based on accumulator # type so we can generate less code this way return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert) diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index c841125db..939879b2c 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -14,9 +14,6 @@ #include "cutlass/epilogue/dispatch_policy.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - #include "cutlass_gemm_caller.cuh" namespace vllm { diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh index d50a83ae1..78d5cf37f 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh @@ -14,9 +14,6 @@ #include "cutlass/epilogue/dispatch_policy.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - #include "cutlass_gemm_caller.cuh" namespace vllm { diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh index e089c3d4b..862202641 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -13,27 +13,18 @@ #include "cutlass/epilogue/dispatch_policy.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - #include "cutlass_gemm_caller.cuh" namespace vllm { using namespace cute; -template > +// clang-format off +template struct cutlass_3x_gemm_fp8_blockwise { - using GroupSizeM = Int; - using GroupSizeN = Int; - using GroupSizeK = Int; - using TileSizeM = Int; - - static_assert(TileSizeM_ % GroupSizeM_ == 0, - "TileSizeM must be a multiple of GroupSizeM"); - using ElementAB = cutlass::float_e4m3_t; using ElementA = ElementAB; @@ -45,52 +36,67 @@ struct cutlass_3x_gemm_fp8_blockwise { static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; using ElementD = OutType; - using StrideD = Stride, Int<0>>; + using LayoutD = cutlass::layout::RowMajor; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; - using ElementC = void; - using StrideC = StrideD; + using ElementC = void; // TODO: support bias + using LayoutC = LayoutD; static constexpr int AlignmentC = AlignmentD; using ElementAccumulator = float; - using ElementBlockScale = float; using ElementCompute = float; + using ElementBlockScale = float; + + using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>; + + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); + using ArchTag = cutlass::arch::Sm90; using OperatorClass = cutlass::arch::OpClassTensorOp; - using TileShape = Shape; - using KernelSchedule = cutlass::gemm:: - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< - GroupSizeM_>; - using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; - using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + using ElementScalar = float; + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + MmaTileShape, + ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, + ElementCompute, + ElementC, + LayoutC, + AlignmentC, + ElementD, + LayoutD, + AlignmentD, + EpilogueScheduler, + DefaultOperation + >::CollectiveOp; - using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< - cutlass::epilogue::fusion::Sm90AccFetch>; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, - ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, - ElementD, StrideD, AlignmentD, EpilogueSchedule, - StoreEpilogueCompute>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, - LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - KernelSchedule>::CollectiveOp; + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp; using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, - SchedulerType>>; + Shape, CollectiveMainloop, CollectiveEpilogue>>; struct GemmKernel : public KernelType {}; - - using StrideA = typename GemmKernel::StrideA; - using StrideB = typename GemmKernel::StrideB; }; template @@ -99,76 +105,54 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { using GemmKernel = typename Gemm::GemmKernel; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideD = typename Gemm::GemmKernel::StrideD; + using StrideC = typename Gemm::GemmKernel::StrideC; + using LayoutSFA = typename Gemm::LayoutSFA; + using LayoutSFB = typename Gemm::LayoutSFB; + using ScaleConfig = typename Gemm::ScaleConfig; using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; - auto prob_shape = c3x::get_problem_shape(a, b); - int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), - k = get<2>(prob_shape); + int32_t m = a.size(0), n = b.size(1), k = a.size(1); - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); + TORCH_CHECK(m % 4 == 0, "m must be divisible by 4"); - using StrideA = Stride, int64_t>; - using StrideB = Stride, int64_t>; - using StrideC = typename Gemm::StrideC; + StrideA a_stride; + StrideB b_stride; + StrideC c_stride; + a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); - StrideA a_stride{lda, Int<1>{}, 0}; - StrideB b_stride{ldb, Int<1>{}, 0}; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + LayoutSFA layout_SFA = + ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); + LayoutSFB layout_SFB = + ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); auto a_ptr = static_cast(a.data_ptr()); auto b_ptr = static_cast(b.data_ptr()); auto a_scales_ptr = static_cast(a_scales.data_ptr()); auto b_scales_ptr = static_cast(b_scales.data_ptr()); - // Check is the t is contiguous and is 1D or 2D with one of the dimensions - // being 1 (i.e. a row or column vector) - auto is_contiguous_vector = [](const torch::Tensor& t) { - auto t_sizes = t.sizes(); - return t.is_contiguous() && - (t.dim() == 1 || - (t.dim() == 2 && - *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); - }; - - // TODO(lucas): lets clean-up the kernel so that we pass in Strides so - // we don't have to deal with enforcing implicit layouts - TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); - TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); - TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), - "a_scales must be M major"); - TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); - TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); - TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), - "b_scales must be K major"); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; + auto mainloop_args = [&](){ + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + }(); + auto prob_shape = cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ {}, c_ptr, c_stride, c_ptr, c_stride}; - - typename GemmKernel::TileSchedulerArguments scheduler; - - static constexpr bool UsesStreamKScheduler = - cute::is_same_v; - - if constexpr (UsesStreamKScheduler) { - using DecompositionMode = typename cutlass::gemm::kernel::detail:: - PersistentTileSchedulerSm90StreamKParams::DecompositionMode; - using ReductionMode = typename cutlass::gemm::kernel::detail:: - PersistentTileSchedulerSm90StreamKParams::ReductionMode; - - scheduler.decomposition_mode = DecompositionMode::StreamK; - scheduler.reduction_mode = ReductionMode::Nondeterministic; - } - c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, - epilogue_args, scheduler); + epilogue_args); } template @@ -177,18 +161,12 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - auto k = a.size(1); - auto n = b.size(1); - - if (k > 3 * n) { - cutlass_gemm_caller_blockwise>( - out, a, b, a_scales, b_scales); - } else { - cutlass_gemm_caller_blockwise>( - out, a, b, a_scales, b_scales); - } + // TODO: better heuristics + cutlass_gemm_caller_blockwise, + Shape<_1, _2, _1>, cutlass::epilogue::TmaWarpSpecializedCooperative, + cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>>( + out, a, b, a_scales, b_scales); } } // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp index 2ee6a1940..3af59267b 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp @@ -32,7 +32,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor."); TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor."); int32_t version_num = get_sm_version_num(); - if (version_num >= 100) { + if (version_num >= 90) { TORCH_CHECK( a.size(0) == a_scales.size(0) && cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1), @@ -41,32 +41,6 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a, cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) && cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1), "b_scale_group_shape must be [128, 128]."); - } else { - // TODO: Remove this after using cutlass sm90 blockwise scaling gemm - // kernel, or introducing ceil_div to the load_init() of mainloop. - using GroupShape = std::array; - auto make_group_shape = [](torch::Tensor const& x, - torch::Tensor const& s) -> GroupShape { - TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D"); - return {cuda_utils::ceil_div(x.size(0), s.size(0)), - cuda_utils::ceil_div(x.size(1), s.size(1))}; - }; - - GroupShape a_scale_group_shape = make_group_shape(a, a_scales); - GroupShape b_scale_group_shape = make_group_shape(b, b_scales); - - // 1x128 per-token group scales for activations - // 128x128 blockwise scales for weights - TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} && - b_scale_group_shape == GroupShape{128, 128} && - a.dtype() == torch::kFloat8_e4m3fn && - b.dtype() == torch::kFloat8_e4m3fn), - "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n" - "a_scale_group_shape must be [1, 128]. Got: [", - a_scale_group_shape[0], ", ", a_scale_group_shape[1], - "]\n" - "b_scale_group_shape must be [128, 128]. Got: [", - b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]"); } TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7ae054dc1..9e4586e8f 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -115,6 +115,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); +#ifndef USE_ROCM + ops.def( + "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, " + "Tensor input, Tensor input_global_scale) -> ()"); + ops.impl("silu_and_mul_nvfp4_quant", torch::kCUDA, &silu_and_mul_nvfp4_quant); +#endif + ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); @@ -161,6 +168,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Polynomial Normalization. + ops.def( + "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float " + "epsilon) -> ()"); + ops.impl("poly_norm", torch::kCUDA, &poly_norm); + // Apply repetition penalties to logits in-place ops.def( "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, " @@ -509,10 +522,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // SM100 CUTLASS MLA decode ops.def( - "sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe," - " Tensor kv_c_and_k_pe_cache, Tensor seq_lens," - " Tensor page_table, Tensor workspace, float " - "scale," + "sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope," + " Tensor q_pe, Tensor kv_c_and_k_pe_cache," + " Tensor seq_lens, Tensor page_table," + " Tensor workspace, float scale," " int num_kv_splits) -> ()"); // conditionally compiled so impl in source file @@ -702,6 +715,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor scale, Tensor? seq_starts) -> ()"); cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA, &gather_and_maybe_dequant_cache); + + cache_ops.def( + "cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, " + "Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()"); + cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { diff --git a/docker/Dockerfile b/docker/Dockerfile index 2e272cbca..307e9658f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -237,7 +237,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py -ARG VLLM_MAX_SIZE_MB=400 +ARG VLLM_MAX_SIZE_MB=450 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ @@ -261,6 +261,8 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# Install libnuma-dev, required by fastsafetensors (fixes #20384) +RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/* COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt @@ -373,7 +375,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.2.14.post1" +ARG FLASHINFER_GIT_REF="v0.3.0" # Flag to control whether to compile FlashInfer AOT kernels # Set to "true" to enable AOT compilation: # docker build --build-arg FLASHINFER_AOT_COMPILE=true ... @@ -432,11 +434,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # Install DeepGEMM from source -ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" +ARG DEEPGEMM_GIT_REF COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh RUN --mount=type=cache,target=/root/.cache/uv \ - VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \ - && rm /tmp/install_deepgemm.sh + VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} # Install EP kernels(pplx-kernels and DeepEP), NixL COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh @@ -518,7 +519,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ else \ BITSANDBYTES_VERSION="0.46.1"; \ fi; \ - uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3] ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron deleted file mode 100644 index 8bc235547..000000000 --- a/docker/Dockerfile.neuron +++ /dev/null @@ -1,56 +0,0 @@ -# default base image -# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04" - -FROM $BASE_IMAGE - -RUN echo "Base image is $BASE_IMAGE" - -# Install some basic utilities -RUN apt-get update && \ - apt-get install -y \ - git \ - python3 \ - python3-pip \ - ffmpeg libsm6 libxext6 libgl1 - -### Mount Point ### -# When launching the container, mount the code directory to /workspace -ARG APP_MOUNT=/workspace -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT}/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity -RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -RUN python3 -m pip install pytest - -# uninstall transformers-neuronx package explicitly to avoid version conflict -RUN python3 -m pip uninstall -y transformers-neuronx - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install -U \ - 'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements/neuron.txt - -ENV VLLM_TARGET_DEVICE neuron -RUN --mount=type=bind,source=.git,target=.git \ - pip install --no-build-isolation -v -e . - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -# install transformers-neuronx package as an optional dependencies (for V0) -# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict -RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps - -RUN python3 -m pip install sentencepiece transformers==4.48.0 -U - -# overwrite entrypoint to run bash script -RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py - -CMD ["/bin/bash"] diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index f16485732..063fc4969 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -47,6 +47,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite # ----------------------- @@ -71,7 +72,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace RUN cd /vllm-workspace \ && rm -rf vllm \ && python3 -m pip install -e tests/vllm_test_utils \ - && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \ + && python3 -m pip install lm-eval[api]==0.4.4 \ && python3 -m pip install pytest-shard # ----------------------- @@ -100,8 +101,10 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 3414c0aa8..2ba5461df 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,18 +1,16 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="db8e93b4" -ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete +ARG HIPBLASLT_BRANCH="aa0bda7b" +ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" ARG LEGACY_HIPBLASLT_OPTION= -ARG RCCL_BRANCH="648a58d" -ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="295f2ed4" +ARG PYTORCH_BRANCH="f717b2af" ARG PYTORCH_VISION_BRANCH="v0.21.0" -ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="916bf3c" +ARG AITER_BRANCH="4822e675" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -45,7 +43,7 @@ RUN apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version -RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython +RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython FROM base AS build_hipblaslt ARG HIPBLASLT_BRANCH @@ -53,6 +51,7 @@ ARG HIPBLAS_COMMON_BRANCH # Set to "--legacy_hipblas_direct" for ROCm<=6.2 ARG LEGACY_HIPBLASLT_OPTION RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y RUN cd hipBLAS-common \ && git checkout ${HIPBLAS_COMMON_BRANCH} \ && mkdir build \ @@ -69,24 +68,17 @@ RUN cd hipBLASLt \ && make package RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install -FROM base AS build_rccl -ARG RCCL_BRANCH -ARG RCCL_REPO -RUN git clone ${RCCL_REPO} -RUN cd rccl \ - && git checkout ${RCCL_BRANCH} \ - && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} -RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install - FROM base AS build_triton ARG TRITON_BRANCH ARG TRITON_REPO RUN git clone ${TRITON_REPO} RUN cd triton \ && git checkout ${TRITON_BRANCH} \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=dist -RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + && if [ ! -f setup.py ]; then cd python; fi \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && mkdir -p /app/install && cp dist/*.whl /app/install +RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \ + && python3 -m build --wheel && cp dist/*.whl /app/install; fi FROM base AS build_amdsmi RUN cd /opt/rocm/share/amd_smi \ @@ -132,15 +124,25 @@ RUN cd aiter \ RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install +FROM base AS debs +RUN mkdir /app/debs +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + cp /install/*.deb /app/debs +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs + FROM base AS final RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ dpkg -i /install/*deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status -RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ - dpkg -i /install/*deb \ - && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ - && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status + && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ + && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ + && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -154,8 +156,6 @@ ARG BASE_IMAGE ARG HIPBLAS_COMMON_BRANCH ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION -ARG RCCL_BRANCH -ARG RCCL_REPO ARG TRITON_BRANCH ARG TRITON_REPO ARG PYTORCH_BRANCH @@ -170,8 +170,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ - && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ - && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ @@ -180,4 +178,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x index 9270b48c5..9942b7626 100644 --- a/docker/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -16,7 +16,8 @@ ENV LANG=C.UTF-8 \ RUN microdnf install -y \ which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ - openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile && \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \ + clang llvm-devel llvm-static clang-devel && \ microdnf clean all # Python Installation @@ -191,7 +192,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ -DCOMPILER_RT_BUILD_ORC=OFF \ -DCOMPILER_RT_INCLUDE_TESTS=OFF \ ${CMAKE_ARGS} -GNinja ../llvm \ - && ninja install . && \ # build llvmlite cd ../../llvmlite && python setup.py bdist_wheel && \ @@ -200,6 +200,45 @@ RUN --mount=type=cache,target=/root/.cache/uv \ sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \ fi && python setup.py bdist_wheel +# Edit aws-lc-sys to support s390x +FROM python-install AS aws-lc-sys-editor +WORKDIR /tmp +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +ARG AWS_LC_VERSION=v0.30.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + git clone --recursive https://github.com/aws/aws-lc-rs.git && \ + cd aws-lc-rs && \ + git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \ + git submodule sync && \ + git submodule update --init --recursive && \ + cd aws-lc-sys && \ + sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \ + sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \ + sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c + +# Build Outlines Core +FROM python-install AS outlines-core-builder +WORKDIR /tmp +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +ARG OUTLINES_CORE_VERSION=0.2.10 +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + --mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \ + git clone https://github.com/dottxt-ai/outlines-core.git && \ + cd outlines-core && \ + git checkout tags/${OUTLINES_CORE_VERSION} && \ + sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \ + echo '[patch.crates-io]' >> Cargo.toml && \ + echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \ + uv pip install maturin && \ + python -m maturin build --release --out dist # Final build stage FROM python-install AS vllm-cpu @@ -230,6 +269,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \ + --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \ sed -i '/^torch/d' requirements/build.txt && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \ @@ -237,6 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \ LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \ NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \ + OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \ uv pip install -v \ $ARROW_WHL_FILE \ $VISION_WHL_FILE \ @@ -244,6 +285,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ $TORCH_WHL_FILE \ $LLVM_WHL_FILE \ $NUMBA_WHL_FILE \ + $OUTLINES_CORE_WHL_FILE \ --index-strategy unsafe-best-match \ -r requirements/build.txt \ -r requirements/cpu.txt diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 65d2e5036..ef4223525 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -1,12 +1,10 @@ FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base -RUN rm /etc/apt/sources.list.d/intel-graphics.list +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ + add-apt-repository -y ppa:kobuk-team/intel-graphics RUN apt clean && apt-get update -y && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get install -y python3.10 python3.10-distutils && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ ffmpeg \ @@ -17,17 +15,29 @@ RUN apt clean && apt-get update -y && \ libgl1 \ lsb-release \ numactl \ - python3.10-dev \ - wget + wget \ + vim \ + python3.12 \ + python3.12-dev \ + python3-pip +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 +RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing + +RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh +RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc +SHELL ["bash", "-c"] +CMD ["bash", "-c", "source /root/.bashrc && exec bash"] WORKDIR /workspace/vllm COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt COPY requirements/common.txt /workspace/vllm/requirements/common.txt +# suppress the python externally managed environment error +RUN python3 -m pip config set global.break-system-packages true + RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ -r requirements/xpu.txt @@ -54,8 +64,9 @@ FROM vllm-base AS vllm-openai RUN --mount=type=cache,target=/root/.cache/pip \ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope -ENV VLLM_USAGE_SOURCE production-docker-image \ - TRITON_XPU_PROFILE 1 +RUN --mount=type=cache,target=/root/.cache/pip \ + pip uninstall oneccl oneccl-devel -y + # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/docs/.nav.yml b/docs/.nav.yml index dbac0e12f..c103ed476 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -32,10 +32,7 @@ nav: - models/pooling_models.md - models/extensions - Hardware Supported Models: models/hardware_supported_models - - Features: - - features/compatibility_matrix.md - - features/* - - features/quantization + - Features: features - Developer Guide: - contributing/README.md - General: @@ -47,11 +44,12 @@ nav: - contributing/model/registration.md - contributing/model/tests.md - contributing/model/multimodal.md + - contributing/model/transcription.md - CI: contributing/ci - Design Documents: design - API Reference: - api/README.md - - api/vllm/* + - api/vllm - CLI Reference: cli - Community: - community/* diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 221a7bd96..a3004249b 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -2,6 +2,8 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA) +- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing) - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) - [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index b11ccb5c0..c853fcf92 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -174,8 +174,10 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: +- GLM-4.5V GLM-4.1V () +- Kimi-VL () - Llama4 () -- MiniCPM-V-4 () +- MiniCPM-V-2.5 or above (, ) - Qwen2.5-VL () - Step3 () @@ -208,7 +210,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 !!! note API server scale-out disables [multi-modal IPC caching](#ipc-caching) - because it requires a one-to-one correspondance between API and engine core processes. + because it requires a one-to-one correspondence between API and engine core processes. This does not impact [multi-modal processor caching](#processor-caching). @@ -225,7 +227,7 @@ to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalPro ### IPC Caching Multi-modal IPC caching is automatically enabled when -there is a one-to-one correspondance between API (`P0`) and engine core (`P1`) processes, +there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes, to avoid repeatedly transferring the same multi-modal inputs between them. ### Configuration diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 2bbed778f..25c2d2955 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -11,9 +11,39 @@ vLLM contains two sets of benchmarks: The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. +### Manually Trigger the benchmark + +Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite. +For CPU environment, please use the image with "-cpu" postfix. + +Here is an example for docker run command for CPU. + +```bash +docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu +``` + +Then, run below command inside the docker instance. + +```bash +bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +``` + +When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json. + +#### Runtime environment variables + +- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). +- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). +- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). +- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. +- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. + +For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results). + The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). -More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). +More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). [](){ #nightly-benchmarks } diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 7ef22d6f8..3dae62dd5 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -90,7 +90,7 @@ address the long build time at its source, the current workaround is to set `VLL to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`) when manually triggering a build on Buildkite. This branch accomplishes two things: -1. Increase the timeout limit to 10 hours so that the build doesn't timeout. +1. Increase the timeout limit to 10 hours so that the build doesn't time out. 2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket to warm it up so that future builds are faster. diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md index 0ca77fa49..6c013738a 100644 --- a/docs/contributing/model/README.md +++ b/docs/contributing/model/README.md @@ -15,6 +15,7 @@ Read through these pages for a step-by-step guide: - [Registering a Model](registration.md) - [Unit Testing](tests.md) - [Multi-Modal Support](multimodal.md) +- [Speech-to-Text Support](transcription.md) !!! tip If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index 21b1f21d6..aafdb1058 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). With these two steps, interleave sliding windows should work with the model. + +### How to support models that use Mamba? + +We consider 3 different scenarios: + +1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers. +2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers. +3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers. + +For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference. +The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config. +For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes. +Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations. +V0-only classes and code will be removed in the very near future. +The model should also be added to the `MODELS_CONFIG_MAP` dictionary in to ensure that the runtime defaults are optimized. + +For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together). +These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol). + +For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively. +Please follow the same guidelines as case (2) for implementing these models. +We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention). +For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. +It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. +Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this. +Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it. +Please see the calls to `direct_register_custom_op` in or for examples of this. +The new custom op should then be added to the list `_attention_ops` in to ensure that piecewise CUDA graphs works as intended. diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 76d0f067f..dc742c8fc 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -855,7 +855,7 @@ Examples: ### Custom HF processor -Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]. +Some models don't define an HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]. Examples: diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md new file mode 100644 index 000000000..62e58e5c6 --- /dev/null +++ b/docs/contributing/model/transcription.md @@ -0,0 +1,276 @@ +# Speech-to-Text (Transcription/Translation) Support + +This document walks you through the steps to add support for speech-to-text (ASR) models to vLLM’s transcription and translation APIs by implementing [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription]. +Please refer to the [supported models](../../models/supported_models.md#transcription) for further guidance. + +## Update the base vLLM model + +It is assumed you have already implemented your model in vLLM according to the basic model guide. Extend your model with the [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription] interface and implement the following class attributes and methods. + +### `supported_languages` and `supports_transcription_only` + +Declare supported languages and capabilities: + +- The `supported_languages` mapping is validated at init time. +- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). + +??? code "supported_languages and supports_transcription_only" + ```python + from typing import ClassVar, Mapping, Optional, Literal + import numpy as np + import torch + from torch import nn + + from vllm.config import ModelConfig, SpeechToTextConfig + from vllm.inputs.data import PromptType + from vllm.model_executor.models.interfaces import SupportsTranscription + + class YourASRModel(nn.Module, SupportsTranscription): + # Map of ISO 639-1 language codes to language names + supported_languages: ClassVar[Mapping[str, str]] = { + "en": "English", + "it": "Italian", + # ... add more as needed + } + + # If your model only supports audio-conditioned generation + # (no text-only generation), enable this flag. + supports_transcription_only: ClassVar[bool] = True + ``` + +Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config]. + +This is for controlling general behavior of the API when serving your model: + +??? code "get_speech_to_text_config()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_speech_to_text_config( + cls, + model_config: ModelConfig, + task_type: Literal["transcribe", "translate"], + ) -> SpeechToTextConfig: + return SpeechToTextConfig( + sample_rate=16_000, + max_audio_clip_s=30, + # Set to None to disable server-side chunking if your + # model/processor handles it already + min_energy_split_window_size=None, + ) + ``` + +See [Audio preprocessing and chunking](#audio-preprocessing-and-chunking) for what each field controls. + +Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns: + +#### Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n) + +Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: + +??? code "get_generation_prompt()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str], + ) -> PromptType: + # Example with a free-form instruction prompt + task_word = "Transcribe" if task_type == "transcribe" else "Translate" + prompt = ( + "user\n" + f"{task_word} this audio: " + "\nmodel\n" + ) + + return { + "multi_modal_data": {"audio": (audio, stt_config.sample_rate)}, + "prompt": prompt, + } + ``` + + For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md). + +#### Encoder–decoder audio-only (e.g., Whisper) + +Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: + +??? code "get_generation_prompt()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str], + ) -> PromptType: + if language is None: + raise ValueError("Language must be specified") + + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (audio, stt_config.sample_rate), + }, + }, + "decoder_prompt": ( + (f"<|prev|>{request_prompt}" if request_prompt else "") + + f"<|startoftranscript|><|{language}|>" + + f"<|{task_type}|><|notimestamps|>" + ), + } + return cast(PromptType, prompt) + ``` + +### `validate_language` (optional) + +Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language] + +If your model requires a language and you want a default, override this method (see Whisper): + +??? code "validate_language()" + ```python + @classmethod + def validate_language(cls, language: Optional[str]) -> Optional[str]: + if language is None: + logger.warning( + "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") + language = "en" + return super().validate_language(language) + ``` + +### `get_num_audio_tokens` (optional) + +Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens] + +Provide a fast duration→token estimate to improve streaming usage statistics: + +??? code "get_num_audio_tokens()" + ```python + class YourASRModel(nn.Module, SupportsTranscription): + ... + + @classmethod + def get_num_audio_tokens( + cls, + audio_duration_s: float, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + ) -> Optional[int]: + # Return None if unknown; otherwise return an estimate. + return int(audio_duration_s * stt_config.sample_rate // 320) # example + ``` + +## Audio preprocessing and chunking + +The API server takes care of basic audio I/O and optional chunking before building prompts: + +- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`. +- Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`. +- Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words. + +Relevant server logic: + +??? code "_preprocess_speech_to_text()" + ```python + # vllm/entrypoints/openai/speech_to_text.py + async def _preprocess_speech_to_text(...): + language = self.model_cls.validate_language(request.language) + ... + y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate) + duration = librosa.get_duration(y=y, sr=sr) + do_split_audio = (self.asr_config.allow_audio_chunking + and duration > self.asr_config.max_audio_clip_s) + chunks = [y] if not do_split_audio else self._split_audio(y, int(sr)) + prompts = [] + for chunk in chunks: + prompt = self.model_cls.get_generation_prompt( + audio=chunk, + stt_config=self.asr_config, + model_config=self.model_config, + language=language, + task_type=self.task_type, + request_prompt=request.prompt, + to_language=to_language, + ) + prompts.append(prompt) + return prompts, duration + ``` + +## Exposing tasks automatically + +vLLM automatically advertises transcription support if your model implements the interface: + +```python +if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + supported_tasks.append("transcription") +``` + +When enabled, the server initializes the transcription and translation handlers: + +```python +state.openai_serving_transcription = OpenAIServingTranscription(...) if "transcription" in supported_tasks else None +state.openai_serving_translation = OpenAIServingTranslation(...) if "transcription" in supported_tasks else None +``` + +No extra registration is required beyond having your model class available via the model registry and implementing `SupportsTranscription`. + +## Examples in-tree + +- Whisper encoder–decoder (audio-only): +- Voxtral decoder-only (audio embeddings + LLM): +- Gemma3n decoder-only with fixed instruction prompt: + +## Test with the API + +Once your model implements `SupportsTranscription`, you can test the endpoints (API mimics OpenAI): + +- Transcription (ASR): + + ```bash + curl -s -X POST \ + -H "Authorization: Bearer $VLLM_API_KEY" \ + -H "Content-Type: multipart/form-data" \ + -F "file=@/path/to/audio.wav" \ + -F "model=$MODEL_ID" \ + http://localhost:8000/v1/audio/transcriptions + ``` + +- Translation (source → English unless otherwise supported): + + ```bash + curl -s -X POST \ + -H "Authorization: Bearer $VLLM_API_KEY" \ + -H "Content-Type: multipart/form-data" \ + -F "file=@/path/to/audio.wav" \ + -F "model=$MODEL_ID" \ + http://localhost:8000/v1/audio/translations + ``` + +Or check out more examples in . + +!!! note + - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. + - Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass. + - For multilingual behavior, keep `supported_languages` aligned with actual model capabilities. diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 74627e906..5b83d9327 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -19,7 +19,7 @@ When using `vllm bench serve`, you can enable profiling by passing the `--profil Traces can be visualized using . !!! tip -You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`. + You can directly call bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`. !!! tip Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. @@ -73,6 +73,8 @@ apt install nsight-systems-cli ### Example commands and usage +When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues). + #### Offline Inference For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference. diff --git a/docs/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md index 0b41e73b0..40a463a8a 100644 --- a/docs/deployment/frameworks/anything-llm.md +++ b/docs/deployment/frameworks/anything-llm.md @@ -1,41 +1,53 @@ -# Anything LLM +# AnythingLLM -[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. +[AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. ## Prerequisites -- Setup vLLM environment +Set up the vLLM environment: + +```bash +pip install vllm +``` ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with a supported chat-completion model, for example: -```bash -vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 -``` + ```bash + vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 + ``` -- Download and install [Anything LLM desktop](https://anythingllm.com/desktop). +1. Download and install [AnythingLLM Desktop](https://anythingllm.com/desktop). -- On the bottom left of open settings, AI Providers --> LLM: - - LLM Provider: Generic OpenAI - - Base URL: http://{vllm server host}:{vllm server port}/v1 - - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` +1. Configure the AI provider: -![](../../assets/deployment/anything-llm-provider.png) + - At the bottom, click the 🔧 wrench icon -> **Open settings** -> **AI Providers** -> **LLM**. + - Enter the following values: + - LLM Provider: Generic OpenAI + - Base URL: `http://{vllm server host}:{vllm server port}/v1` + - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` -- Back to home page, New Workspace --> create `vllm` workspace, and start to chat: + ![set AI providers](../../assets/deployment/anything-llm-provider.png) -![](../../assets/deployment/anything-llm-chat-without-doc.png) +1. Create a workspace: -- Click the upload button: - - upload the doc - - select the doc and move to the workspace - - save and embed + 1. At the bottom, click the ↺ back icon and back to workspaces. + 1. Create a workspace (e.g., `vllm`) and start chatting. -![](../../assets/deployment/anything-llm-upload-doc.png) + ![create a workspace](../../assets/deployment/anything-llm-chat-without-doc.png) -- Chat again: +1. Add a document. -![](../../assets/deployment/anything-llm-chat-with-doc.png) + 1. Click the 📎 attachment icon. + 1. Upload a document. + 1. Select and move the document into your workspace. + 1. Save and embed it. + + ![add a document](../../assets/deployment/anything-llm-upload-doc.png) + +1. Chat using your document as context. + + ![chat with your context](../../assets/deployment/anything-llm-chat-with-doc.png) diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md index c255a85d3..7517ee771 100644 --- a/docs/deployment/frameworks/autogen.md +++ b/docs/deployment/frameworks/autogen.md @@ -4,9 +4,7 @@ ## Prerequisites -- Setup vLLM environment - -- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment +Set up the vLLM and [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment: ```bash pip install vllm @@ -18,14 +16,14 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]" ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -python -m vllm.entrypoints.openai.api_server \ - --model mistralai/Mistral-7B-Instruct-v0.2 -``` + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model mistralai/Mistral-7B-Instruct-v0.2 + ``` -- Call it with AutoGen: +1. Call it with AutoGen: ??? code diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md index cbca6e628..002935da5 100644 --- a/docs/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -6,27 +6,31 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac ## Prerequisites -- Setup vLLM environment +Set up the vLLM environment: + +```bash +pip install vllm +``` ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve qwen/Qwen1.5-0.5B-Chat -``` + ```bash + vllm serve qwen/Qwen1.5-0.5B-Chat + ``` -- Download and install [Chatbox desktop](https://chatboxai.app/en#download). +1. Download and install [Chatbox desktop](https://chatboxai.app/en#download). -- On the bottom left of settings, Add Custom Provider +1. On the bottom left of settings, Add Custom Provider - API Mode: `OpenAI API Compatible` - Name: vllm - API Host: `http://{vllm server host}:{vllm server port}/v1` - API Path: `/chat/completions` - Model: `qwen/Qwen1.5-0.5B-Chat` -![](../../assets/deployment/chatbox-settings.png) + ![](../../assets/deployment/chatbox-settings.png) -- Go to `Just chat`, and start to chat: +1. Go to `Just chat`, and start to chat: -![](../../assets/deployment/chatbox-chat.png) + ![](../../assets/deployment/chatbox-chat.png) diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index 35f02c33c..820ef0cbe 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -8,44 +8,50 @@ This guide walks you through deploying Dify using a vLLM backend. ## Prerequisites -- Setup vLLM environment -- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) +Set up the vLLM environment: + +```bash +pip install vllm +``` + +And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/). ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve Qwen/Qwen1.5-7B-Chat -``` + ```bash + vllm serve Qwen/Qwen1.5-7B-Chat + ``` -- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): +1. Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): -```bash -git clone https://github.com/langgenius/dify.git -cd dify -cd docker -cp .env.example .env -docker compose up -d -``` + ```bash + git clone https://github.com/langgenius/dify.git + cd dify + cd docker + cp .env.example .env + docker compose up -d + ``` -- Open the browser to access `http://localhost/install`, config the basic login information and login. +1. Open the browser to access `http://localhost/install`, config the basic login information and login. -- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. +1. In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. + +1. Fill in the model provider details as follows: -- Fill in the model provider details as follows: - **Model Type**: `LLM` - **Model Name**: `Qwen/Qwen1.5-7B-Chat` - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1` - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Completion Mode**: `Completion` -![](../../assets/deployment/dify-settings.png) + ![](../../assets/deployment/dify-settings.png) -- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: +1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: -![](../../assets/deployment/dify-create-chatbot.png) + ![](../../assets/deployment/dify-create-chatbot.png) -- Click the chatbot you just created to open the chat interface and start interacting with the model: +1. Click the chatbot you just created to open the chat interface and start interacting with the model: -![](../../assets/deployment/dify-chat.png) + ![](../../assets/deployment/dify-chat.png) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index 70b4b48d4..836305cf1 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -6,7 +6,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac ## Prerequisites -- Setup vLLM and Haystack environment +Set up the vLLM and Haystack environment: ```bash pip install vllm haystack-ai @@ -14,13 +14,13 @@ pip install vllm haystack-ai ## Deploy -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve mistralai/Mistral-7B-Instruct-v0.1 -``` + ```bash + vllm serve mistralai/Mistral-7B-Instruct-v0.1 + ``` -- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. +1. Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. ??? code diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index c7e514f22..0d6c37299 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -13,7 +13,7 @@ And LiteLLM supports all models on VLLM. ## Prerequisites -- Setup vLLM and litellm environment +Set up the vLLM and litellm environment: ```bash pip install vllm litellm @@ -23,13 +23,13 @@ pip install vllm litellm ### Chat completion -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -vllm serve qwen/Qwen1.5-0.5B-Chat -``` + ```bash + vllm serve qwen/Qwen1.5-0.5B-Chat + ``` -- Call it with litellm: +1. Call it with litellm: ??? code @@ -51,13 +51,13 @@ vllm serve qwen/Qwen1.5-0.5B-Chat ### Embeddings -- Start the vLLM server with the supported embedding model, e.g. +1. Start the vLLM server with the supported embedding model, e.g. -```bash -vllm serve BAAI/bge-base-en-v1.5 -``` + ```bash + vllm serve BAAI/bge-base-en-v1.5 + ``` -- Call it with litellm: +1. Call it with litellm: ```python from litellm import embedding diff --git a/docs/deployment/frameworks/lobe-chat.md b/docs/deployment/frameworks/lobe-chat.md index e3e7dbe6e..8ecd1484e 100644 --- a/docs/deployment/frameworks/lobe-chat.md +++ b/docs/deployment/frameworks/lobe-chat.md @@ -6,6 +6,6 @@ Supports speech-synthesis, multi-modal, and extensible (function call) plugin sy One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application. -It supports vLLM as a AI model provider to efficiently serve large language models. +It supports vLLM as an AI model provider to efficiently serve large language models. For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm). diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md index 3319dc6c9..3b9fa3ea4 100644 --- a/docs/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml` metadata: name: vllm spec: - replicas: 2 + replicas: 1 leaderWorkerTemplate: size: 2 restartPolicy: RecreateGroupOnPodRestart @@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml` - sh - -c - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); - python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" + vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2" resources: limits: nvidia.com/gpu: "8" @@ -126,8 +126,6 @@ Should get an output similar to this: NAME READY STATUS RESTARTS AGE vllm-0 1/1 Running 0 2s vllm-0-1 1/1 Running 0 2s -vllm-1 1/1 Running 0 2s -vllm-1-1 1/1 Running 0 2s ``` Verify that the distributed tensor-parallel inference works: diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index d5f2ec302..d86ab1600 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -11,7 +11,7 @@ Here are the integrations: ### Prerequisites -- Setup vLLM and langchain environment +Set up the vLLM and langchain environment: ```bash pip install -U vllm \ @@ -22,33 +22,33 @@ pip install -U vllm \ ### Deploy -- Start the vLLM server with the supported embedding model, e.g. +1. Start the vLLM server with the supported embedding model, e.g. -```bash -# Start embedding service (port 8000) -vllm serve ssmits/Qwen2-7B-Instruct-embed-base -``` + ```bash + # Start embedding service (port 8000) + vllm serve ssmits/Qwen2-7B-Instruct-embed-base + ``` -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -# Start chat service (port 8001) -vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 -``` + ```bash + # Start chat service (port 8001) + vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 + ``` -- Use the script: +1. Use the script: -- Run the script +1. Run the script -```python -python retrieval_augmented_generation_with_langchain.py -``` + ```python + python retrieval_augmented_generation_with_langchain.py + ``` ## vLLM + llamaindex ### Prerequisites -- Setup vLLM and llamaindex environment +Set up the vLLM and llamaindex environment: ```bash pip install vllm \ @@ -60,24 +60,24 @@ pip install vllm \ ### Deploy -- Start the vLLM server with the supported embedding model, e.g. +1. Start the vLLM server with the supported embedding model, e.g. -```bash -# Start embedding service (port 8000) -vllm serve ssmits/Qwen2-7B-Instruct-embed-base -``` + ```bash + # Start embedding service (port 8000) + vllm serve ssmits/Qwen2-7B-Instruct-embed-base + ``` -- Start the vLLM server with the supported chat completion model, e.g. +1. Start the vLLM server with the supported chat completion model, e.g. -```bash -# Start chat service (port 8001) -vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 -``` + ```bash + # Start chat service (port 8001) + vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 + ``` -- Use the script: +1. Use the script: -- Run the script +1. Run the script: -```python -python retrieval_augmented_generation_with_llamaindex.py -``` + ```python + python retrieval_augmented_generation_with_llamaindex.py + ``` diff --git a/docs/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md index 28031f01f..8eb7f8d81 100644 --- a/docs/deployment/integrations/llamastack.md +++ b/docs/deployment/integrations/llamastack.md @@ -1,6 +1,6 @@ # Llama Stack -vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . +vLLM is also available via [Llama Stack](https://github.com/llamastack/llama-stack). To install Llama Stack, run @@ -8,9 +8,9 @@ To install Llama Stack, run pip install llama-stack -q ``` -## Inference using OpenAI Compatible API +## Inference using OpenAI-Compatible API -Then start Llama Stack server pointing to your vLLM server with the following configuration: +Then start the Llama Stack server and configure it to point to your vLLM server with the following settings: ```yaml inference: @@ -20,15 +20,15 @@ inference: url: http://127.0.0.1:8000 ``` -Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider. +Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/providers/inference/remote_vllm.html) for more details on this remote vLLM provider. -## Inference via Embedded vLLM +## Inference using Embedded vLLM -An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) +An [inline provider](https://github.com/llamastack/llama-stack/tree/main/llama_stack/providers/inline/inference) is also available. This is a sample of configuration using that method: ```yaml -inference +inference: - provider_type: vllm config: model: Llama3.1-8B-Instruct diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index cad801a43..ca23e0b9f 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -380,7 +380,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated" -If the startup or readiness probe failureThreshold is too low for the time needed to startup the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened: +If the startup or readiness probe failureThreshold is too low for the time needed to start up the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened: 1. container log contains "KeyboardInterrupt: terminated" 2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted` diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 202e9c1ca..cb2037b57 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -54,8 +54,8 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts ### FusedMoEPrepareAndFinalize -The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare` and `finalize` functions. -The `prepare` function is responsible for input activation Quantization and All2All Dispatch. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) +The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions. +The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks") @@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C #### Step 1: Add an All2All manager -The purpose of the All2All Manager is to setup the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py). +The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py). #### Step 2: Add a FusedMoEPrepareAndFinalize Type @@ -146,6 +146,10 @@ This section describes the significance of the various functions exposed by the `FusedMoEPrepareAndFinalize::prepare()`: The prepare method implements the Quantization and All2All Dispatch. Typically the Dispatch function from the relevant All2All Manager is invoked. +`FusedMoEPrepareAndFinalize::has_prepare_no_receive()`: Indicates whether or not this subclass implements `prepare_no_receive`. Defaults to False. + +`FusedMoEPrepareAndFinalize::prepare_no_receive()`: The prepare_no_receive method implements the Quantization and All2All Dispatch. It does not wait for the result of the dispatch operation but instead returns a thunk that can be invoked to wait for the final results. Typically the Dispatch function from the relevant All2All Manager is invoked. + `FusedMoEPrepareAndFinalize::finalize()`: Maybe perform TopK Weight Application and Reduction and All2All Combine. Typically the Combine function from the relevant All2AllManager is invoked. `FusedMoEPrepareAndFinalize::activation_format()`: Return `FusedMoEActivationFormat.BatchedExperts` if the output of the prepare method (i.e. the All2All dispatch) is Batched. Return `FusedMoEActivationFormat.Standard` otherwise. diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md new file mode 100644 index 000000000..e70ee4a07 --- /dev/null +++ b/docs/design/io_processor_plugins.md @@ -0,0 +1,78 @@ +# IO Processor Plugins + +IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output. + +When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint. + +## Writing an IO Processor Plugin + +IO Processor plugins implement the `IOProcessor` interface (): + +```python +IOProcessorInput = TypeVar('IOProcessorInput') +IOProcessorOutput = TypeVar('IOProcessorOutput') + +class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): + + def __init__(self, vllm_config: VllmConfig): + self.vllm_config = vllm_config + + @abstractmethod + def pre_process( + self, + prompt: IOProcessorInput, + request_id: Optional[str] = None, + **kwargs, + ) -> Union[PromptType, Sequence[PromptType]]: + raise NotImplementedError + + async def pre_process_async( + self, + prompt: IOProcessorInput, + request_id: Optional[str] = None, + **kwargs, + ) -> Union[PromptType, Sequence[PromptType]]: + return self.pre_process(prompt, request_id, **kwargs) + + @abstractmethod + def post_process(self, + model_output: Sequence[PoolingRequestOutput], + request_id: Optional[str] = None, + **kwargs) -> IOProcessorOutput: + raise NotImplementedError + + async def post_process_async( + self, + model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], + request_id: Optional[str] = None, + **kwargs, + ) -> IOProcessorOutput: + collected_output = [item async for i, item in model_output] + return self.post_process(collected_output, request_id, **kwargs) + + @abstractmethod + def parse_request(self, request: Any) -> IOProcessorInput: + raise NotImplementedError + + @abstractmethod + def output_to_response( + self, plugin_output: IOProcessorOutput) -> IOProcessorResponse: + raise NotImplementedError +``` + +The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods. +The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference. +The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output. + +The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here . + +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online () and offline () inference examples. + +## Using an IO Processor plugin + +IO Processor plugins are loaded at engine startup and there are two methods for specifying the name of the plugin to be loaded: + +1. Via vLLM's `EngineArgs`: setting the `io_processor_plugin` argument in the `EngineArgs` used to initialize the `AsyncLLM`. The same can be achieved by passing the `io_processor_plugin` argument to `LLM` in offline mode, or by passing the `--io-processor-plugin` argument in serving mode. +2. Via the model HF configuration: adding an `io_processor_plugin` field to the model config (config.json). + +The order also determines method priority. i.e., setting the plugin name via `EngineArgs` will override any plugin name specified in the model HF config (config.json). diff --git a/docs/design/metrics.md b/docs/design/metrics.md index b24364247..90b2fd32f 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -99,11 +99,11 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201 ### Multi-process Mode -In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See . +In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See . ### Built in Python/Process Metrics -The following metrics are supported by default by `prometheus_client`, but they are not exposed when multi-process mode is used: +The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used: - `python_gc_objects_collected_total` - `python_gc_objects_uncollectable_total` diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md index 247072d1c..6e92b20d2 100644 --- a/docs/design/multiprocessing.md +++ b/docs/design/multiprocessing.md @@ -8,7 +8,7 @@ page for information on known issues and how to solve them. ## Introduction !!! important - The source code references are to the state of the code at the time of writing in December, 2024. + The source code references are to the state of the code at the time of writing in December 2024. The use of Python multiprocessing in vLLM is complicated by: diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index ca1c2c230..371938097 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -49,6 +49,8 @@ Every plugin has three parts: - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. +- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name. + ## Guidelines for Writing Plugins - **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. diff --git a/docs/features/compatibility_matrix.md b/docs/features/README.md similarity index 98% rename from docs/features/compatibility_matrix.md rename to docs/features/README.md index 5b08b3810..de23cd0a9 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/README.md @@ -1,4 +1,6 @@ -# Compatibility Matrix +# Features + +## Compatibility Matrix The tables below show mutually exclusive features and the support on some hardware. @@ -12,7 +14,7 @@ The symbols used have the following meanings: !!! note Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination. -## Feature x Feature +### Feature x Feature