Compare commits
223 commits: v0.18.0rc0...v0.18.1rc0
Commit SHA1s (223):
298e510848, 3982bc2cd0, 02eec7ecbe, 17ee641c45, 0d50fa1db6, 1fa1e53a73, 3ffa52009f, 87bd91892f, c7f98b4d0a, 1c472f8fe1,
c57d38d603, e5ed6c6c13, b3d0b37908, 85f671b8e1, 8bc6b5cdb0, 4f16ebbbd3, 12fd17eb51, 37aadf6237, d7d2b5e405, 6ec5e9fd37,
e1d85e5c24, 79eb9369c5, e80cfe575d, d0532bf38d, fb4e8bf442, 6ade4bc5a5, 2e089b96a8, 880be2b1b8, c0f5fae601, aa84e43ccb,
5e806bcf54, 56a62c310c, 1779c09898, 44eea10f68, 8b6c6b9505, 9f6d9dd371, dd20ee4e3e, 0523449c9c, b4c1aef21c, 6050b93bed,
5a4a179591, 37cd9fc107, 9cfd4ebb5e, ed359c497a, dcee9be95a, bd8c4c0752, 0140eafb15, bdf6a0a57b, 0674d1fee7, 30108fc8b0,
e2d1c8b5e8, 6951fcd44f, 39474513f6, 638a872d77, 9040151fe1, 8fbe3f303f, ea2c148fa7, 47b7af0d87, 269bf46d99, e5a77a5015,
ca1ac1a4b4, 4ca3fa6bb4, be12afd284, df3c0291a3, 2be1a0f74b, 4120a05ff1, 98ff042917, b55156eae9, 112944fab9, 91be5f9be3,
4ee847e400, 040a505ff5, 9279c59a0e, 7454096199, fb8b5e05fc, e5d96dc8fc, daa05bf340, 7769b58307, 2f9f946b22, 2890aecce5,
34f093b417, 4dce8321a9, 657855ab41, e27b8ba3d1, 40b8363b45, 8b10e4fb31, 104605cbf2, 96266f119b, 7c0cf3bcd0, 572b432913,
9515c20868, c63ca2b2e6, a32eaf5bb2, e390742c59, 7a6ebcbfcf, c7bc12c20f, f9e2a38386, 4426447bba, 3322e26420, 765e461065,
6a9cceb219, 199f914183, ca21483bf9, da70c87e81, 0b6d52629f, d3cc379567, 354cd580d5, d49f273144, b21d384304, e3126cd107,
e37ff5b5c8, 6accb21f2a, 053f3b6309, 5f82706a21, c32a58cc2a, ef2c4f778d, 9dade5da3a, 828f862acb, 577df69b26, 04244fd0e1,
9482b0b085, 5bc1da147f, 0091017188, 0d81a1fe61, 6ae4c8d6fc, a913b612d8, 5ce2d10e4a, 738d0a281f, 70b81c4f3d, 7476d148db,
f3732bd931, 0ef7f79054, 5dd8df0701, 39bfb57b7c, c9d838fc33, b1169d7be8, 17808394bc, 296839a1b0, c373b5c00d, de1a86b7de,
99267c23ca, 525f2eeb0b, 918b7890a1, 98b09ddc27, cef1f302d2, 17c47fb869, b322b197f1, eaf7c9b976, 47a1f11bff, fad09e8a1f,
8c31f47c63, 261801242f, fcf0687b27, 86b7e3c95a, 0e95916155, ce2ef42fd3, 8b6325758c, a0dd1995c7, f1740006e4, 58cde5c026,
761e0aa7a0, ff9fbc9aff, e6c4797704, 09e4576f65, 3ed7b1e6e0, e8f9dbc369, de35c06c66, c0745a851a, b5ca9c3557, 245758992e,
1204cf0a9d, b36adfa349, e78821b438, 51f0acda79, fa75204b16, bdb903bb5f, 68f783a727, c5030c439d, 51b2333be1, 4ed51308c8,
c781fbbab3, 979ff44cea, f63ed7b5ac, c9e5096256, 2ff0ad9694, a836524d20, 3717a4dd47, ecfcdd2ce4, c25dbc2d27, 77d2a5f17b,
59192dfd39, 56cb1baa66, f340324335, 2660b9289c, 293f036e6d, 0fb142a454, 00f8e0d211, 4af9ed21cb, 9c7cab5ebb, 132bfd45b6,
24b4272a8c, 8a680463fa, 20b14095a4, 17c1bdf371, 3e3d320c1b, 54a62a79f7, 384dc7f77b, f04d5226f8, 0a0a1a198b, 6c1cfbad32,
45f526d652, 5db91f0aaf, 061980c36a, 7a49742b88, 3e6a1e1686, 7961486a9b, 4f9b14c21c, 31a458c091, a3a51d20e7, e5b807607c,
fd4d96302a, c0f011918d, e6ae4b1be1
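The same range can be reproduced locally from a checkout that has both release-candidate tags fetched (a minimal sketch; assumes the upstream vllm-project/vllm remote is named `origin`):

```bash
# List the commits that are in v0.18.1rc0 but not in v0.18.0rc0,
# one line per commit (short SHA + subject); this should correspond
# to the compare range summarized above.
git fetch --tags origin
git log --oneline v0.18.0rc0..v0.18.1rc0
```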
@@ -0,0 +1 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml

@@ -16,6 +16,23 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT

# ── Detect PyTorch index URL ─────────────────────────────────────────────

if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
TORCH_INDEX_URL="${CANDIDATE_URL}"
else
echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
echo ">>> Falling back to default PyPI (resolution may be incomplete)"
TORCH_INDEX_URL=""
fi
else
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
fi
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"

# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(

@@ -116,6 +133,11 @@ echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"

EXTRA_INDEX_ARGS=()
if [[ -n "${TORCH_INDEX_URL}" ]]; then
EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
fi

set +e
uv pip compile \
"${WORK_DIR}/requirements.txt" \

@@ -126,7 +148,7 @@ uv pip compile \
-c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \
--extra-index-url https://download.pytorch.org/whl/cu129 \
"${EXTRA_INDEX_ARGS[@]}" \
--index-strategy unsafe-best-match \
--unsafe-package setuptools \
--unsafe-package ray \

@@ -333,15 +333,18 @@ apply_rocm_test_overrides() {
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/chat_completion/test_audio.py \
--ignore=entrypoints/openai/completion/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/models/test_models.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
fi

if [[ $cmds == *" entrypoints/serve"* ]]; then
cmds="${cmds} \
--ignore=entrypoints/serve/lora/test_lora_adapters.py"
fi

if [[ $cmds == *" entrypoints/llm "* ]]; then

@@ -127,7 +127,7 @@ run_and_track_test() {

# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
@@ -33,23 +33,22 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
pytest -v -s v1/test_serial_utils.py
'
@@ -1,11 +1,14 @@
#!/usr/bin/env bash
set -euxo pipefail

# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#
# Environment variables:
# ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
# ROCM_ATTN, FLASHINFER). If unset, uses vllm default.
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}

@@ -22,6 +25,14 @@ wait_for_server() {

MODEL="deepseek-ai/DeepSeek-V2-Lite"

# ── Build optional vllm serve flags ─────────────────────────────────────

EXTRA_ARGS=()
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
echo "Using attention backend: ${ATTENTION_BACKEND}"
EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
fi

cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true

@@ -40,7 +51,8 @@ vllm serve "$MODEL" \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
--port "$PORT" \
${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
SERVER_PID=$!
wait_for_server "$PORT"
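The `${EXTRA_ARGS+"${EXTRA_ARGS[@]}"}` expansion in the hunk above is the usual guard for passing a possibly-empty array to a command under `set -u`, where older bash treats an empty array as unset. A minimal sketch of the pattern, with a plain `echo` standing in for the real `vllm serve` invocation:

```bash
#!/usr/bin/env bash
set -euo pipefail

EXTRA_ARGS=()   # stays empty unless an optional flag is requested
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
  EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
fi

# Plain "${EXTRA_ARGS[@]}" can raise "unbound variable" on bash < 4.4 when the
# array is empty; ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} expands to nothing in that
# case and to the properly quoted elements otherwise.
echo serve-command --port 8030 ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"}
```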
File diff suppressed because it is too large
@@ -59,7 +59,7 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -s -v tests/compile/passes/distributed

- label: Fusion and Compile Unit Tests (B200)
- label: Fusion and Compile Unit Tests (2xB200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
@@ -15,8 +15,29 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
- label: Distributed DP Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/v1/distributed
|
||||
- tests/entrypoints/openai/test_multi_api_servers.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
|
||||
|
||||
- label: Distributed Compile + RPC Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
@@ -29,22 +50,31 @@ steps:
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
- label: Distributed Torchrun + Shutdown Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/distributed/
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
@@ -52,41 +82,35 @@ steps:
|
||||
|
||||
- label: Distributed Torchrun + Examples (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
working_dir: "/vllm-workspace"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_torchrun_example.py
|
||||
- tests/distributed/test_torchrun_example_moe.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- examples/rl/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
# OLD rlhf examples
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
# NEW rlhf examples
|
||||
- cd new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||
- python3 examples/offline_inference/data_parallel.py --enforce-eager
|
||||
# rlhf examples
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py
|
||||
|
||||
- label: Distributed DP Tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
@@ -169,7 +193,7 @@ steps:
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
|
||||
@@ -70,3 +70,15 @@ steps:
|
||||
device: mi325_4
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 e2e (4xH100)
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
num_devices: 4
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention/backends/utils.py
|
||||
- vllm/v1/worker/gpu_model_runner.py
|
||||
- tests/v1/e2e/test_hybrid_chunked_prefill.py
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py
|
||||
|
||||
@@ -10,7 +10,7 @@ steps:
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
timeout_in_minutes: 40
|
||||
@@ -34,7 +34,7 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
mirror:
|
||||
amd:
|
||||
@@ -48,11 +48,11 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/rpc
|
||||
- tests/entrypoints/instrumentator
|
||||
- tests/entrypoints/serve/instrumentator
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/instrumentator
|
||||
- pytest -v -s entrypoints/serve/instrumentator
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s tool_use
|
||||
|
||||
@@ -75,19 +75,6 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/responses
|
||||
|
||||
- label: Entrypoints V1
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
|
||||
@@ -24,8 +24,7 @@ steps:
|
||||
|
||||
- label: Elastic EP Scaling Test
|
||||
timeout_in_minutes: 20
|
||||
device: b200
|
||||
optional: true
|
||||
device: h100
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
|
||||
@@ -35,7 +35,7 @@ steps:
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
timeout_in_minutes: 60
|
||||
timeout_in_minutes: 25
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- csrc/moe/
|
||||
@@ -47,7 +47,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
parallelism: 5
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
timeout_in_minutes: 45
|
||||
|
||||
@@ -45,6 +45,22 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Qwen3.5 Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/qwen3_5.py
|
||||
- vllm/model_executor/models/qwen3_5_mtp.py
|
||||
- vllm/transformers_utils/configs/qwen3_5.py
|
||||
- vllm/transformers_utils/configs/qwen3_5_moe.py
|
||||
- vllm/model_executor/models/qwen3_next.py
|
||||
- vllm/model_executor/models/qwen3_next_mtp.py
|
||||
- vllm/model_executor/layers/fla/ops/
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
|
||||
@@ -8,7 +8,7 @@ steps:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemoel_lora.py
|
||||
parallelism: 4
|
||||
|
||||
|
||||
@@ -30,4 +30,5 @@ steps:
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
- pytest -v -s -x lora/test_qwen35_densemoel_lora.py
|
||||
@@ -9,9 +9,9 @@ steps:
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
|
||||
@@ -11,7 +11,7 @@ steps:
|
||||
- vllm/v1/attention/
|
||||
- tests/v1/engine/test_llm_engine.py
|
||||
- tests/v1/e2e/
|
||||
- tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- tests/entrypoints/llm/test_struct_output_generate.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
@@ -22,7 +22,7 @@ steps:
|
||||
- pytest -v -s v1/e2e/general/test_context_length.py
|
||||
- pytest -v -s v1/e2e/general/test_min_tokens.py
|
||||
# Temporary hack filter to exclude ngram spec decoding based tests.
|
||||
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
|
||||
- pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
|
||||
|
||||
- label: Model Runner V2 Examples
|
||||
timeout_in_minutes: 45
|
||||
|
||||
@@ -62,7 +62,7 @@ steps:
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
- label: Multi-Modal Processor (CPU)
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 60
|
||||
@@ -95,34 +95,44 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models (Extended) 1
|
||||
- label: Multi-Modal Models (Extended Generation 1)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/multimodal/generation
|
||||
- tests/models/multimodal/test_mapping.py
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||
- pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
|
||||
- pytest -v -s models/multimodal/test_mapping.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Multi-Modal Models (Extended) 2
|
||||
- label: Multi-Modal Models (Extended Generation 2)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/multimodal/generation
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended) 3
|
||||
- label: Multi-Modal Models (Extended Generation 3)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/multimodal/generation
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended Pooling)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal/pooling
|
||||
commands:
|
||||
- pytest -v -s models/multimodal/pooling -m 'not core_model'
|
||||
|
||||
@@ -36,6 +36,6 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
@@ -35,7 +35,7 @@ steps:
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 30
|
||||
|
||||
.github/CODEOWNERS (3 changes, vendored)
@@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
@@ -171,6 +171,7 @@ mkdocs.yaml @hmellor
|
||||
|
||||
# Pooling models
|
||||
/examples/pooling @noooop
|
||||
/docs/models/pooling_models @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
|
||||
.github/mergify.yml (11 changes, vendored)
@@ -260,7 +260,7 @@ pull_request_rules:
|
||||
- files=examples/offline_inference/structured_outputs.py
|
||||
- files=examples/online_serving/structured_outputs/structured_outputs.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- files=tests/entrypoints/llm/test_struct_output_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
actions:
|
||||
label:
|
||||
@@ -333,9 +333,10 @@ pull_request_rules:
|
||||
- label != stale
|
||||
- or:
|
||||
- files~=^tests/tool_use/
|
||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||
- files~=^tests/tool_parsers/
|
||||
- files~=^tests/entrypoints/openai/.*tool.*
|
||||
- files~=^tests/entrypoints/anthropic/.*tool.*
|
||||
- files~=^vllm/tool_parsers/
|
||||
- files=docs/features/tool_calling.md
|
||||
- files~=^examples/tool_chat_*
|
||||
- files=examples/offline_inference/chat_with_tools.py
|
||||
@@ -381,7 +382,7 @@ pull_request_rules:
|
||||
- or:
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
||||
actions:
|
||||
assign:
|
||||
|
||||
.github/scripts/cleanup_pr_body.sh (50 changes, vendored)
@@ -1,50 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -eu
|
||||
|
||||
# ensure 1 argument is passed
|
||||
if [ "$#" -ne 1 ]; then
|
||||
echo "Usage: $0 <pr_number>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PR_NUMBER=$1
|
||||
OLD=/tmp/orig_pr_body.txt
|
||||
NEW=/tmp/new_pr_body.txt
|
||||
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
|
||||
sed -i '/<!--.*-->$/d' "${NEW}"
|
||||
|
||||
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
|
||||
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
||||
python3 - <<EOF
|
||||
import regex as re
|
||||
|
||||
with open("${NEW}", "r") as file:
|
||||
content = file.read()
|
||||
|
||||
pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
|
||||
content = re.sub(pattern, '', content)
|
||||
|
||||
with open("${NEW}", "w") as file:
|
||||
file.write(content)
|
||||
EOF
|
||||
|
||||
# Run this only if ${NEW} is different than ${OLD}
|
||||
if ! cmp -s "${OLD}" "${NEW}"; then
|
||||
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
|
||||
echo
|
||||
echo "Updated PR body:"
|
||||
echo
|
||||
cat "${NEW}"
|
||||
else
|
||||
echo "No changes needed"
|
||||
fi
|
||||
.github/workflows/cleanup_pr_body.yml (32 changes, vendored)
@@ -1,32 +0,0 @@
|
||||
name: Cleanup PR Body
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened, edited]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
update-description:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
cache: 'pip'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install regex
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
.github/workflows/issue_autolabel.yml (105 changes, vendored)
@@ -383,4 +383,107 @@ jobs:
|
||||
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
- name: Request missing ROCm info from issue author
|
||||
if: contains(steps.label-step.outputs.labels_added, 'rocm') && contains(toJSON(github.event.issue.labels.*.name), 'bug')
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const body = (context.payload.issue.body || '').toLowerCase();
|
||||
|
||||
// Check for existing bot comments to avoid duplicate requests
|
||||
const comments = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
});
|
||||
const botAlreadyAsked = comments.data.some(
|
||||
c => c.user.type === 'Bot' && c.body.includes('<!-- rocm-info-request -->')
|
||||
);
|
||||
if (botAlreadyAsked) {
|
||||
core.notice('ROCm info request already posted, skipping');
|
||||
return;
|
||||
}
|
||||
|
||||
// Define required information and detection patterns
|
||||
const requiredInfo = [
|
||||
{
|
||||
name: 'Reproducer',
|
||||
patterns: [
|
||||
/reproduc/i, /minimal.?example/i, /repro\b/i, /steps to reproduce/i,
|
||||
/code.?snippet/i, /sample.?code/i,
|
||||
/```python[\s\S]*?```/, /```bash[\s\S]*?```/, /```sh[\s\S]*?```/,
|
||||
],
|
||||
ask: 'A minimal reproducer (code snippet or script that triggers the issue)',
|
||||
},
|
||||
{
|
||||
name: 'Error message',
|
||||
patterns: [
|
||||
/error/i, /traceback/i, /exception/i, /fault/i, /crash/i,
|
||||
/failed/i, /abort/i, /panic/i,
|
||||
],
|
||||
ask: 'The full error message or traceback',
|
||||
},
|
||||
{
|
||||
name: 'Installation method',
|
||||
patterns: [
|
||||
/docker/i, /rocm\/pytorch/i, /dockerfile/i, /from source/i,
|
||||
/pip install/i, /build.?from/i, /container/i, /image/i,
|
||||
/wheel/i, /\.whl/i, /nightly/i,
|
||||
],
|
||||
ask: 'How you installed vLLM (Docker image name, pip install, or build from source steps)',
|
||||
},
|
||||
{
|
||||
name: 'Command',
|
||||
patterns: [
|
||||
/vllm serve/i, /python\s+\S+\.py/i, /```bash[\s\S]*?```/,
|
||||
/```sh[\s\S]*?```/, /command/i, /launch/i, /run\s/i,
|
||||
/--model/i, /--tensor-parallel/i, /--gpu-memory/i,
|
||||
],
|
||||
ask: 'The command you used to launch vLLM (e.g., `vllm serve ...` or the Python script)',
|
||||
},
|
||||
{
|
||||
name: 'GFX architecture',
|
||||
patterns: [
|
||||
/gfx\d{3,4}/i, /mi\d{3}/i, /mi\d{2}\b/i, /radeon/i,
|
||||
/gpu.?arch/i, /rocm-smi/i, /rocminfo/i, /navi/i,
|
||||
/instinct/i,
|
||||
],
|
||||
ask: 'Your GPU model and GFX architecture (e.g., MI300X / gfx942) — run `rocminfo | grep gfx`',
|
||||
},
|
||||
];
|
||||
|
||||
const issueBody = context.payload.issue.body || '';
|
||||
const missing = requiredInfo.filter(info =>
|
||||
!info.patterns.some(p => p.test(issueBody))
|
||||
);
|
||||
|
||||
if (missing.length === 0) {
|
||||
core.notice('All required ROCm info appears to be present');
|
||||
return;
|
||||
}
|
||||
|
||||
const author = context.payload.issue.user.login;
|
||||
const checklist = requiredInfo.map(info => {
|
||||
const found = !missing.includes(info);
|
||||
return `- [${found ? 'x' : ' '}] ${info.ask}`;
|
||||
}).join('\n');
|
||||
const message = [
|
||||
'<!-- rocm-info-request -->',
|
||||
`Hi @${author}, thanks for reporting this ROCm issue!`,
|
||||
'',
|
||||
'To help us investigate, please make sure the following information is included:',
|
||||
'',
|
||||
checklist,
|
||||
'',
|
||||
'Please provide any unchecked items above. This will help us reproduce and resolve the issue faster. Thank you!',
|
||||
].join('\n');
|
||||
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: message,
|
||||
});
|
||||
core.notice(`Requested missing ROCm info from @${author}: ${missing.map(m => m.name).join(', ')}`);
|
||||
.github/workflows/macos-smoke-test.yml (6 changes, vendored)
@@ -1,9 +1,9 @@
|
||||
name: macOS Apple Silicon Smoke Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Daily at 2:30 AM UTC
|
||||
- cron: '30 2 * * *'
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
permissions:
|
||||
|
||||
.github/workflows/new_pr_bot.yml (96 changes, new file, vendored)
@@ -0,0 +1,96 @@
|
||||
name: New PR Bot
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
update-description:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Update PR description
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const { owner, repo } = context.repo;
|
||||
const pr_number = context.issue.number;
|
||||
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
owner,
|
||||
repo,
|
||||
pull_number: pr_number,
|
||||
});
|
||||
|
||||
let body = pr.body || '';
|
||||
const original = body;
|
||||
|
||||
// Remove markdown comments (<!-- ... -->)
|
||||
body = body.replace(/^<!--.*-->$/gm, '');
|
||||
|
||||
// Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ..."
|
||||
body = body.replace(/^PLEASE FILL IN THE PR DESCRIPTION HERE.*$/gm, '');
|
||||
|
||||
// Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ ..."
|
||||
body = body.replace(/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*[\s\S]*$/, '');
|
||||
|
||||
// Remove <details> section containing "PR Checklist (Click to Expand)"
|
||||
body = body.replace(/(---\n\n)?<details>[\s\S]*?<summary>[\s\S]*?PR Checklist \(Click to Expand\)[\s\S]*?<\/summary>[\s\S]*?<\/details>/g, '');
|
||||
|
||||
if (body !== original) {
|
||||
await github.rest.pulls.update({
|
||||
owner,
|
||||
repo,
|
||||
pull_number: pr_number,
|
||||
body,
|
||||
});
|
||||
console.log('Updated PR body');
|
||||
} else {
|
||||
console.log('No changes needed');
|
||||
}
|
||||
|
||||
reminder-comment:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Post welcome comment for first-time contributors
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const { owner, repo } = context.repo;
|
||||
const prAuthor = context.payload.pull_request.user.login;
|
||||
|
||||
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
|
||||
q: `repo:${owner}/${repo} type:pr author:${prAuthor}`,
|
||||
per_page: 1,
|
||||
});
|
||||
|
||||
const authorPRCount = searchResults.total_count;
|
||||
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
|
||||
|
||||
if (authorPRCount === 1) {
|
||||
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
|
||||
await github.rest.issues.createComment({
|
||||
owner,
|
||||
repo,
|
||||
issue_number: context.issue.number,
|
||||
body: [
|
||||
'\u{1f44b} Hi! Thank you for contributing to the vLLM project.',
|
||||
'',
|
||||
'\u{1f4ac} Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.',
|
||||
'',
|
||||
'Just a reminder: PRs would not trigger full CI run by default.',
|
||||
'',
|
||||
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.',
|
||||
'',
|
||||
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.',
|
||||
'',
|
||||
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.',
|
||||
'',
|
||||
'\u{1f680}',
|
||||
].join('\n'),
|
||||
});
|
||||
} else {
|
||||
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
|
||||
}
|
||||
.github/workflows/pre-commit.yml (30 changes, vendored)
@@ -11,9 +11,39 @@ concurrency:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
|
||||
jobs:
|
||||
pre-run-check:
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check PR label and author merge count
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
...context.repo,
|
||||
pull_number: context.payload.pull_request.number,
|
||||
});
|
||||
|
||||
const hasReadyLabel = pr.labels.some(l => l.name === 'ready');
|
||||
|
||||
const { data: mergedPRs } = await github.rest.search.issuesAndPullRequests({
|
||||
q: `repo:${context.repo.owner}/${context.repo.repo} is:pr is:merged author:${pr.user.login}`,
|
||||
per_page: 4,
|
||||
});
|
||||
const mergedCount = mergedPRs.total_count;
|
||||
|
||||
if (hasReadyLabel || mergedCount >= 4) {
|
||||
core.info(`Check passed: ready label=${hasReadyLabel}, 4+ merged PRs=${mergedCount >= 4}`);
|
||||
} else {
|
||||
core.setFailed(`PR must have the 'ready' label or the author must have at least 4 merged PRs (found ${mergedCount}).`);
|
||||
}
|
||||
|
||||
pre-commit:
|
||||
needs: pre-run-check
|
||||
if: always() && (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
.github/workflows/reminder_comment.yml (54 changes, vendored)
@@ -1,54 +0,0 @@
|
||||
name: PR Reminder Comment Bot
|
||||
permissions:
|
||||
pull-requests: write
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
jobs:
|
||||
pr_reminder:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Remind to run full CI on PR
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
try {
|
||||
// Get the PR author
|
||||
const prAuthor = context.payload.pull_request.user.login;
|
||||
|
||||
// Check if this is the author's first PR in this repository
|
||||
// Use GitHub's search API to find all PRs by this author
|
||||
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
|
||||
q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
|
||||
per_page: 100
|
||||
});
|
||||
|
||||
const authorPRCount = searchResults.total_count;
|
||||
|
||||
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
|
||||
|
||||
// Only post comment if this is the first PR (only one PR by this author)
|
||||
if (authorPRCount === 1) {
|
||||
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
|
||||
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
|
||||
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
|
||||
'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
|
||||
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
|
||||
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
|
||||
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
|
||||
'🚀'
|
||||
});
|
||||
} else {
|
||||
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error checking PR history or posting comment:', error);
|
||||
// Don't fail the workflow, just log the error
|
||||
}
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
@@ -340,7 +340,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
@@ -986,6 +985,48 @@ define_extension_target(
|
||||
# Setting this variable sidesteps the issue by calling the driver directly.
|
||||
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||
|
||||
# add OR VLLM_GPU_LANG STREQUAL "HIP" here once
|
||||
# https://github.com/vllm-project/vllm/issues/35163 is resolved
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
#
|
||||
# _C_stable_libtorch extension (ops registered via STABLE_TORCH_LIBRARY)
|
||||
#
|
||||
set(VLLM_STABLE_EXT_SRC
|
||||
"csrc/libtorch_stable/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_STABLE_EXT_SRC "csrc/libtorch_stable/permute_cols.cu")
|
||||
endif()
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_STABLE_EXT_SRC}"
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
endif()
|
||||
|
||||
message(STATUS "Enabling C_stable extension.")
|
||||
define_extension_target(
|
||||
_C_stable_libtorch
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${VLLM_STABLE_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
# Set TORCH_TARGET_VERSION for stable ABI compatibility.
|
||||
# This ensures we only use C-shim APIs available in PyTorch 2.10.
|
||||
# _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
|
||||
# which is currently set to 2.10.
|
||||
target_compile_definitions(_C_stable_libtorch PRIVATE
|
||||
TORCH_TARGET_VERSION=0x020A000000000000ULL)
|
||||
|
||||
# Needed to use cuda APIs from C-shim
|
||||
target_compile_definitions(_C_stable_libtorch PRIVATE
|
||||
USE_CUDA)
|
||||
endif()
|
||||
|
||||
#
|
||||
# _moe_C extension
|
||||
#
|
||||
@@ -999,6 +1040,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/moe_wna16.cu"
|
||||
"csrc/moe/grouped_topk_kernels.cu"
|
||||
"csrc/moe/gpt_oss_router_gemm.cu"
|
||||
"csrc/moe/router_gemm.cu")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -47,6 +47,8 @@ from common import (
|
||||
is_mla_backend,
|
||||
)
|
||||
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
|
||||
|
||||
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||
@@ -462,7 +464,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--batch-specs",
|
||||
nargs="+",
|
||||
default=["q2k", "8q1s1k"],
|
||||
default=None,
|
||||
help="Batch specifications using extended grammar",
|
||||
)
|
||||
|
||||
@@ -478,6 +480,21 @@ def main():
|
||||
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
|
||||
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
|
||||
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype",
|
||||
default="auto",
|
||||
choices=["auto", "fp8"],
|
||||
help="KV cache dtype: auto or fp8",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cuda-graphs",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=True,
|
||||
help=(
|
||||
"Launch kernels with CUDA graphs to eliminate CPU overhead"
|
||||
"in measurements (default: True)"
|
||||
),
|
||||
)
|
||||
|
||||
# Parameter sweep (use YAML config for advanced sweeps)
|
||||
parser.add_argument(
|
||||
@@ -536,21 +553,24 @@ def main():
|
||||
|
||||
# Batch specs and sizes
|
||||
# Support both explicit batch_specs and generated batch_spec_ranges
|
||||
if "batch_spec_ranges" in yaml_config:
|
||||
# Generate batch specs from ranges
|
||||
generated_specs = generate_batch_specs_from_ranges(
|
||||
yaml_config["batch_spec_ranges"]
|
||||
)
|
||||
# Combine with any explicit batch_specs
|
||||
if "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||
else:
|
||||
args.batch_specs = generated_specs
|
||||
console.print(
|
||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||
)
|
||||
elif "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"]
|
||||
# CLI --batch-specs takes precedence over YAML when provided.
|
||||
cli_batch_specs_provided = args.batch_specs is not None
|
||||
if not cli_batch_specs_provided:
|
||||
if "batch_spec_ranges" in yaml_config:
|
||||
# Generate batch specs from ranges
|
||||
generated_specs = generate_batch_specs_from_ranges(
|
||||
yaml_config["batch_spec_ranges"]
|
||||
)
|
||||
# Combine with any explicit batch_specs
|
||||
if "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||
else:
|
||||
args.batch_specs = generated_specs
|
||||
console.print(
|
||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||
)
|
||||
elif "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"]
|
||||
|
||||
if "batch_sizes" in yaml_config:
|
||||
args.batch_sizes = yaml_config["batch_sizes"]
|
||||
@@ -575,6 +595,10 @@ def main():
|
||||
args.warmup_iters = yaml_config["warmup_iters"]
|
||||
if "profile_memory" in yaml_config:
|
||||
args.profile_memory = yaml_config["profile_memory"]
|
||||
if "kv_cache_dtype" in yaml_config:
|
||||
args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
|
||||
if "cuda_graphs" in yaml_config:
|
||||
args.cuda_graphs = yaml_config["cuda_graphs"]
|
||||
|
||||
# Parameter sweep configuration
|
||||
if "parameter_sweep" in yaml_config:
|
||||
@@ -629,12 +653,18 @@ def main():
|
||||
# Determine backends
|
||||
backends = args.backends or ([args.backend] if args.backend else ["flash"])
|
||||
prefill_backends = getattr(args, "prefill_backends", None)
|
||||
if not args.batch_specs:
|
||||
args.batch_specs = ["q2k", "8q1s1k"]
|
||||
console.print(f"Backends: {', '.join(backends)}")
|
||||
if prefill_backends:
|
||||
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
|
||||
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
|
||||
console.print(f"KV cache dtype: {args.kv_cache_dtype}")
|
||||
console.print(f"CUDA graphs: {args.cuda_graphs}")
|
||||
console.print()
|
||||
|
||||
init_workspace_manager(args.device)
|
||||
|
||||
# Run benchmarks
|
||||
all_results = []
|
||||
|
||||
@@ -687,6 +717,8 @@ def main():
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
use_cuda_graphs=args.cuda_graphs,
|
||||
)
|
||||
|
||||
# Add decode pipeline config
|
||||
@@ -839,6 +871,8 @@ def main():
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
"kv_cache_dtype": args.kv_cache_dtype,
|
||||
"use_cuda_graphs": args.cuda_graphs,
|
||||
}
|
||||
all_results = run_model_parameter_sweep(
|
||||
backends,
|
||||
@@ -861,6 +895,8 @@ def main():
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
"kv_cache_dtype": args.kv_cache_dtype,
|
||||
"use_cuda_graphs": args.cuda_graphs,
|
||||
}
|
||||
all_results = run_parameter_sweep(
|
||||
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
|
||||
@@ -891,6 +927,8 @@ def main():
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
use_cuda_graphs=args.cuda_graphs,
|
||||
)
|
||||
|
||||
result = run_benchmark(config)
|
||||
|
||||
@@ -213,6 +213,9 @@ class BenchmarkConfig:
|
||||
profile_memory: bool = False
|
||||
use_cuda_graphs: bool = False
|
||||
|
||||
# "auto" or "fp8"
|
||||
kv_cache_dtype: str = "auto"
|
||||
|
||||
# MLA-specific
|
||||
prefill_backend: str | None = None
|
||||
kv_lora_rank: int | None = None
|
||||
@@ -369,6 +372,7 @@ class ResultsFormatter:
|
||||
"backend",
|
||||
"batch_spec",
|
||||
"num_layers",
|
||||
"kv_cache_dtype",
|
||||
"mean_time",
|
||||
"std_time",
|
||||
"throughput",
|
||||
@@ -382,6 +386,7 @@ class ResultsFormatter:
|
||||
"backend": r.config.backend,
|
||||
"batch_spec": r.config.batch_spec,
|
||||
"num_layers": r.config.num_layers,
|
||||
"kv_cache_dtype": r.config.kv_cache_dtype,
|
||||
"mean_time": r.mean_time,
|
||||
"std_time": r.std_time,
|
||||
"throughput": r.throughput_tokens_per_sec or 0,
|
||||
|
||||
@@ -30,9 +30,9 @@ batch_specs:
|
||||
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
||||
|
||||
# Context extension + decode
|
||||
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
|
||||
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
|
||||
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
|
||||
- "2q1ks2k_16q1s1k" # 2 extend + 16 decode
|
||||
- "4q2ks4k_32q1s2k" # 4 extend + 32 decode
|
||||
- "2q1ks8k_32q1s2k" # 2 large extend + 32 decode
|
||||
|
||||
# Explicitly chunked prefill
|
||||
- "q8k" # 8k prefill with chunking hint
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
# MLA decode-only benchmark configuration
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128 # Base value, can be swept for TP simulation
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
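# e.g. with backend FLASHMLA_SPARSE, the sweep values above would label the runs
# "FLASHMLA_SPARSE_128h", "FLASHMLA_SPARSE_64h", "FLASHMLA_SPARSE_32h" and "FLASHMLA_SPARSE_16h"
# (assuming the backend name is substituted verbatim into {backend}).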
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
- "16q1s1k" # 16 requests, 1k KV cache
|
||||
- "16q1s2k" # 16 requests, 2k KV cache
|
||||
- "16q1s4k" # 16 requests, 4k KV cache
|
||||
|
||||
# Medium batches
|
||||
- "32q1s1k" # 32 requests, 1k KV cache
|
||||
- "32q1s2k" # 32 requests, 2k KV cache
|
||||
- "32q1s4k" # 32 requests, 4k KV cache
|
||||
- "32q1s8k" # 32 requests, 8k KV cache
|
||||
|
||||
# Large batches
|
||||
- "64q1s1k" # 64 requests, 1k KV cache
|
||||
- "64q1s2k" # 64 requests, 2k KV cache
|
||||
- "64q1s4k" # 64 requests, 4k KV cache
|
||||
- "64q1s8k" # 64 requests, 8k KV cache
|
||||
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
- "128q1s4k" # 128 requests, 4k KV cache
|
||||
- "128q1s8k" # 128 requests, 8k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- FLASHMLA_SPARSE
|
||||
- FLASHINFER_MLA_SPARSE
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 100
|
||||
warmup_iters: 10
|
||||
profile_memory: true
|
||||
@@ -60,9 +60,11 @@ def create_minimal_vllm_config(
|
||||
model_name: str = "deepseek-v3",
|
||||
block_size: int = 128,
|
||||
max_num_seqs: int = 256,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
mla_dims: dict | None = None,
|
||||
index_topk: int | None = None,
|
||||
prefill_backend: str | None = None,
|
||||
kv_cache_dtype: str = "auto",
|
||||
) -> VllmConfig:
|
||||
"""
|
||||
Create minimal VllmConfig for MLA benchmarks.
|
||||
@@ -149,13 +151,13 @@ def create_minimal_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
cache_dtype="auto",
|
||||
cache_dtype=kv_cache_dtype,
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=8192,
|
||||
max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
|
||||
max_model_len=32768,
|
||||
is_encoder_decoder=False,
|
||||
enable_chunked_prefill=True,
|
||||
@@ -535,6 +537,7 @@ def _create_backend_impl(
|
||||
device: torch.device,
|
||||
max_num_tokens: int = 8192,
|
||||
index_topk: int | None = None,
|
||||
kv_cache_dtype: str = "auto",
|
||||
):
|
||||
"""
|
||||
Create backend implementation instance.
|
||||
@@ -583,7 +586,7 @@ def _create_backend_impl(
|
||||
"num_kv_heads": mla_dims["num_kv_heads"],
|
||||
"alibi_slopes": None,
|
||||
"sliding_window": None,
|
||||
"kv_cache_dtype": "auto",
|
||||
"kv_cache_dtype": kv_cache_dtype,
|
||||
"logits_soft_cap": None,
|
||||
"attn_type": "decoder",
|
||||
"kv_sharing_target_layer_name": None,
|
||||
@@ -701,6 +704,7 @@ def _run_single_benchmark(
|
||||
mla_dims: dict,
|
||||
device: torch.device,
|
||||
indexer=None,
|
||||
kv_cache_dtype: str | None = None,
|
||||
) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark iteration.
|
||||
@@ -734,49 +738,124 @@ def _run_single_benchmark(
|
||||
)
|
||||
|
||||
# Create KV cache
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
if kv_cache_dtype is None:
|
||||
kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
|
||||
head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
|
||||
if kv_cache_dtype == "fp8_ds_mla":
|
||||
# FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
|
||||
# Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
|
||||
# + 2*rope_dim bf16 bytes
|
||||
# = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
656,
|
||||
device=device,
|
||||
dtype=torch.uint8,
|
||||
)
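# Worked byte count for this fp8_ds_mla layout with the DeepSeek dims used here
# (kv_lora_rank=512, qk_rope_head_dim=64):
#   512 latent values * 1 byte (fp8) + 4 tile scales * 4 bytes (float32)
#   + 64 rope values * 2 bytes (bf16) = 512 + 16 + 128 = 656 bytes per token.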
|
||||
elif kv_cache_dtype == "fp8":
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Create input tensors for both decode and prefill modes
|
||||
decode_inputs, prefill_inputs = _create_input_tensors(
|
||||
total_q,
|
||||
mla_dims,
|
||||
backend_cfg["query_format"],
|
||||
device,
|
||||
torch.bfloat16,
|
||||
)
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
head_size,
|
||||
device=device,
|
||||
dtype=torch.uint8,
|
||||
).view(current_platform.fp8_dtype())
|
||||
else:
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
head_size,
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Fill indexer with random indices for sparse backends
|
||||
is_sparse = backend_cfg.get("is_sparse", False)
|
||||
if is_sparse and indexer is not None:
|
||||
indexer.fill_random_indices(total_q, max_kv_len)
|
||||
|
||||
# Determine which forward method to use based on metadata
|
||||
if metadata.decode is not None:
|
||||
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
|
||||
elif metadata.prefill is not None:
|
||||
forward_fn = lambda: impl.forward_mha(
|
||||
prefill_inputs["q"],
|
||||
prefill_inputs["k_c_normed"],
|
||||
prefill_inputs["k_pe"],
|
||||
kv_cache,
|
||||
metadata,
|
||||
prefill_inputs["k_scale"],
|
||||
prefill_inputs["output"],
|
||||
)
|
||||
else:
|
||||
# Determine which forward methods to use based on metadata.
|
||||
# Sparse MLA backends always use forward_mqa
|
||||
has_decode = is_sparse or getattr(metadata, "decode", None) is not None
|
||||
has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
|
||||
if not has_decode and not has_prefill:
|
||||
raise RuntimeError("Metadata has neither decode nor prefill metadata")
|
||||
|
||||
num_decode = (
|
||||
metadata.num_decode_tokens
|
||||
if (has_decode and has_prefill)
|
||||
else total_q
|
||||
if has_decode
|
||||
else 0
|
||||
)
|
||||
num_prefill = total_q - num_decode
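# Worked example: for a mixed batch with total_q = 2064 and metadata.num_decode_tokens = 16,
# num_decode = 16 and num_prefill = 2048; a decode-only (or sparse) batch gives
# num_decode = total_q, and a prefill-only batch gives num_decode = 0.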
|
||||
|
||||
# Some backends require fp8 queries when using an fp8 KV cache.
|
||||
is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
|
||||
quantize_query = is_fp8_kvcache and getattr(
|
||||
impl, "supports_quant_query_input", False
|
||||
)
|
||||
|
||||
# quantize_query forces concat format
|
||||
query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
|
||||
|
||||
# Create decode query tensors
|
||||
if has_decode:
|
||||
decode_inputs, _ = _create_input_tensors(
|
||||
num_decode, mla_dims, query_fmt, device, torch.bfloat16
|
||||
)
|
||||
# Cast decode query to fp8 if the backend supports it
|
||||
if quantize_query:
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if isinstance(decode_inputs, tuple):
|
||||
decode_inputs = torch.cat(list(decode_inputs), dim=-1)
|
||||
decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
|
||||
|
||||
# Create prefill input tensors
|
||||
if has_prefill:
|
||||
_, prefill_inputs = _create_input_tensors(
|
||||
num_prefill, mla_dims, query_fmt, device, torch.bfloat16
|
||||
)
|
||||
|
||||
# Build forward function
|
||||
def forward_fn():
|
||||
results = []
|
||||
if has_decode:
|
||||
results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
|
||||
if has_prefill:
|
||||
results.append(
|
||||
impl.forward_mha(
|
||||
prefill_inputs["q"],
|
||||
prefill_inputs["k_c_normed"],
|
||||
prefill_inputs["k_pe"],
|
||||
kv_cache,
|
||||
metadata,
|
||||
prefill_inputs["k_scale"],
|
||||
prefill_inputs["output"],
|
||||
)
|
||||
)
|
||||
return results[0] if len(results) == 1 else tuple(results)
|
||||
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
forward_fn()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Optionally capture a CUDA graph after warmup.
|
||||
# Graph replay eliminates CPU launch overhead so timings reflect pure
|
||||
# kernel time.
|
||||
if config.use_cuda_graphs:
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
forward_fn()
|
||||
benchmark_fn = graph.replay
|
||||
else:
|
||||
benchmark_fn = forward_fn
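# Note: graph replay re-executes the kernels with the tensor addresses captured above,
# so the decode/prefill inputs and the KV cache must not be reallocated between repeats
# (they are not here), and the backend's forward must be capture-safe (no CPU syncs).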
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
@@ -785,7 +864,7 @@ def _run_single_benchmark(
|
||||
|
||||
start.record()
|
||||
for _ in range(config.num_layers):
|
||||
forward_fn()
|
||||
benchmark_fn()
|
||||
end.record()
|
||||
|
||||
torch.accelerator.synchronize()
|
||||
@@ -852,13 +931,30 @@ def _run_mla_benchmark_batched(
|
||||
# Determine if this is a sparse backend
|
||||
is_sparse = backend_cfg.get("is_sparse", False)
|
||||
|
||||
# Extract kv_cache_dtype from the first config
|
||||
kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
|
||||
|
||||
# FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
|
||||
# Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
|
||||
if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
|
||||
kv_cache_dtype = "fp8_ds_mla"
|
||||
|
||||
# Compute max total_q across all configs so the metadata builder buffer
|
||||
# and scheduler config are large enough for all batch specs.
|
||||
max_total_q = max(
|
||||
sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
|
||||
for cfg, *_ in configs_with_params
|
||||
)
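# e.g. with specs "q2k" (one 2048-token prefill, total_q = 2048) and "8q1s1k"
# (8 single-token decode requests, total_q = 8), max_total_q = 2048.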
|
||||
|
||||
# Create and set vLLM config for MLA (reused across all benchmarks)
|
||||
vllm_config = create_minimal_vllm_config(
|
||||
model_name="deepseek-v3", # Used only for model path
|
||||
block_size=block_size,
|
||||
max_num_batched_tokens=max_total_q,
|
||||
mla_dims=mla_dims, # Use custom dims from config or default
|
||||
index_topk=index_topk if is_sparse else None,
|
||||
prefill_backend=prefill_backend,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
)
|
||||
|
||||
results = []
|
||||
@@ -883,7 +979,9 @@ def _run_mla_benchmark_batched(
|
||||
mla_dims,
|
||||
vllm_config,
|
||||
device,
|
||||
max_num_tokens=max_total_q,
|
||||
index_topk=index_topk if is_sparse else None,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
)
|
||||
|
||||
# Verify the actual prefill backend matches what was requested
|
||||
@@ -942,6 +1040,7 @@ def _run_mla_benchmark_batched(
|
||||
mla_dims,
|
||||
device,
|
||||
indexer=indexer,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ def _create_vllm_config(
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=config.block_size,
|
||||
cache_dtype="auto",
|
||||
cache_dtype=config.kv_cache_dtype,
|
||||
)
|
||||
cache_config.num_gpu_blocks = max_num_blocks
|
||||
cache_config.num_cpu_blocks = 0
|
||||
@@ -215,7 +215,7 @@ def _create_backend_impl(
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
kv_cache_dtype=config.kv_cache_dtype,
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
@@ -288,12 +288,22 @@ def _create_input_tensors(
|
||||
total_q: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
quantize_query: bool = False,
|
||||
) -> tuple:
|
||||
"""Create Q, K, V input tensors for all layers."""
|
||||
"""Create Q, K, V input tensors for all layers.
|
||||
|
||||
When quantize_query is True, queries are cast to fp8 to match backends
|
||||
that require query/key/value dtype consistency.
|
||||
"""
|
||||
q_dtype = dtype
|
||||
if quantize_query:
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
q_dtype = current_platform.fp8_dtype()
|
||||
q_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
).to(q_dtype)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
k_list = [
|
||||
@@ -344,10 +354,17 @@ def _create_kv_cache(
|
||||
# Compute inverse permutation to get back to logical view
|
||||
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
|
||||
|
||||
# Use fp8 dtype for cache when requested.
|
||||
cache_dtype = dtype
|
||||
if config.kv_cache_dtype == "fp8":
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
cache_dtype = current_platform.fp8_dtype()
|
||||
|
||||
cache_list = []
|
||||
for _ in range(config.num_layers):
|
||||
# Allocate in physical layout order (contiguous in memory)
|
||||
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
|
||||
cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
|
||||
# Permute to logical view
|
||||
cache = cache.permute(*inv_order)
|
||||
cache_list.append(cache)
|
||||
@@ -392,6 +409,37 @@ def _run_single_benchmark(
|
||||
)
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Optionally capture a CUDA graph after warmup.
|
||||
# Graph replay eliminates CPU launch overhead so timings reflect pure
|
||||
# kernel time.
|
||||
if config.use_cuda_graphs:
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
benchmark_fn = graph.replay
|
||||
else:
|
||||
|
||||
def benchmark_fn():
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
@@ -399,16 +447,7 @@ def _run_single_benchmark(
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
benchmark_fn()
|
||||
end.record()
|
||||
|
||||
torch.accelerator.synchronize()
|
||||
@@ -502,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
common_attn_metadata=common_metadata,
|
||||
)
|
||||
|
||||
# Only quantize queries when the impl supports it
|
||||
quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
|
||||
impl, "supports_quant_query_input", False
|
||||
)
|
||||
q_list, k_list, v_list = _create_input_tensors(
|
||||
config, total_q, device, dtype
|
||||
config, total_q, device, dtype, quantize_query=quantize_query
|
||||
)
|
||||
|
||||
cache_list = _create_kv_cache(
|
||||
|
||||
@@ -40,9 +40,9 @@ LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
|
||||
details.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
@@ -124,7 +124,7 @@ def main(args):
|
||||
|
||||
# Create the LLM engine
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
||||
|
||||
print("------warm up------")
|
||||
|
||||
@@ -32,6 +32,7 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@@ -196,7 +197,7 @@ def main(args):
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
|
||||
@@ -3,10 +3,10 @@
|
||||
"""Benchmark offline prioritization."""
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
@@ -79,7 +79,7 @@ def run_vllm(
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
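One likely motivation for the fields()-based construction above (and in the other two benchmarks): dataclasses.asdict recursively converts nested dataclasses into plain dicts, whereas reading each field with getattr passes the attribute values through unchanged. A small standalone illustration:

import dataclasses
from dataclasses import dataclass, field, fields

@dataclass
class Inner:
    x: int = 1

@dataclass
class Outer:
    inner: Inner = field(default_factory=Inner)

o = Outer()
as_dict = dataclasses.asdict(o)                              # {'inner': {'x': 1}}: Inner became a dict
by_field = {f.name: getattr(o, f.name) for f in fields(o)}   # {'inner': Inner(x=1)}: object preserved
print(type(as_dict["inner"]), type(by_field["inner"]))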
|
||||
|
||||
@@ -750,17 +750,20 @@ def get_weight_block_size_safety(config, default_value=None):
|
||||
|
||||
|
||||
def get_model_params(config):
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
architectures = getattr(config, "architectures", None) or [type(config).__name__]
|
||||
architecture = architectures[0]
|
||||
|
||||
if architecture == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "JambaForCausalLM":
|
||||
elif architecture == "JambaForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in (
|
||||
elif architecture in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
@@ -774,7 +777,7 @@ def get_model_params(config):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in (
|
||||
elif architecture in (
|
||||
"Qwen2MoeForCausalLM",
|
||||
"Qwen3MoeForCausalLM",
|
||||
"Qwen3NextForCausalLM",
|
||||
@@ -783,23 +786,27 @@ def get_model_params(config):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
|
||||
elif architecture in (
|
||||
"Qwen3VLMoeForConditionalGeneration",
|
||||
"Qwen3_5MoeForConditionalGeneration",
|
||||
"Qwen3_5MoeTextConfig",
|
||||
):
|
||||
text_config = config.get_text_config()
|
||||
E = text_config.num_experts
|
||||
topk = text_config.num_experts_per_tok
|
||||
intermediate_size = text_config.moe_intermediate_size
|
||||
hidden_size = text_config.hidden_size
|
||||
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
|
||||
elif architecture == "HunYuanMoEV1ForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.moe_topk[0]
|
||||
intermediate_size = config.moe_intermediate_size[0]
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
|
||||
elif architecture == "Qwen3OmniMoeForConditionalGeneration":
|
||||
E = config.thinker_config.text_config.num_experts
|
||||
topk = config.thinker_config.text_config.num_experts_per_tok
|
||||
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
|
||||
hidden_size = config.thinker_config.text_config.hidden_size
|
||||
elif config.architectures[0] == "PixtralForConditionalGeneration":
|
||||
elif architecture == "PixtralForConditionalGeneration":
|
||||
# Pixtral can contain different LLM architectures,
|
||||
# recurse to get their parameters
|
||||
return get_model_params(config.get_text_config())
|
||||
@@ -814,6 +821,23 @@ def get_model_params(config):
|
||||
return E, topk, intermediate_size, hidden_size
|
||||
|
||||
|
||||
def resolve_dtype(config) -> torch.dtype:
|
||||
if current_platform.is_rocm():
|
||||
return torch.float16
|
||||
|
||||
dtype = getattr(config, "dtype", None)
|
||||
if dtype is not None:
|
||||
return dtype
|
||||
|
||||
if hasattr(config, "get_text_config"):
|
||||
text_config = config.get_text_config()
|
||||
dtype = getattr(text_config, "dtype", None)
|
||||
if dtype is not None:
|
||||
return dtype
|
||||
|
||||
return torch.bfloat16
|
||||
|
||||
|
||||
def get_quantization_group_size(config) -> int | None:
|
||||
"""Extract the quantization group size from the HF model config.
|
||||
|
||||
@@ -861,7 +885,7 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
dtype = resolve_dtype(config)
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_int4_w4a16 = args.dtype == "int4_w4a16"
|
||||
|
||||
benchmarks/kernels/benchmark_router_gemm.py (new file, 134 lines)
@@ -0,0 +1,134 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
# Dimensions supported by the DSV3 specialized kernel
|
||||
DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
|
||||
DSV3_SUPPORTED_HIDDEN_SIZES = [7168]
|
||||
|
||||
# Dimensions supported by the gpt-oss specialized kernel
|
||||
GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
|
||||
GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]
|
||||
|
||||
|
||||
def get_batch_size_range(max_batch_size):
|
||||
return [2**x for x in range(14) if 2**x <= max_batch_size]
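# e.g. get_batch_size_range(16) -> [1, 2, 4, 8, 16]; the range(14) bound caps the sweep at 2**13 = 8192.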
|
||||
|
||||
|
||||
def get_model_params(config):
|
||||
if config.architectures[0] in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
):
|
||||
num_experts = config.n_routed_experts
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in ("GptOssForCausalLM",):
|
||||
num_experts = config.num_local_experts
|
||||
hidden_size = config.hidden_size
|
||||
else:
|
||||
raise ValueError(f"Unsupported architecture: {config.architectures}")
|
||||
return num_experts, hidden_size
|
||||
|
||||
|
||||
def get_benchmark(model, max_batch_size, trust_remote_code):
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=get_batch_size_range(max_batch_size),
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=[
|
||||
"torch",
|
||||
"vllm",
|
||||
],
|
||||
line_names=["PyTorch", "vLLM"],
|
||||
styles=([("blue", "-"), ("red", "-")]),
|
||||
ylabel="TFLOPs",
|
||||
plot_name=f"{model} router gemm throughput",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider):
|
||||
config = get_config(model=model, trust_remote_code=trust_remote_code)
|
||||
num_experts, hidden_size = get_model_params(config)
|
||||
|
||||
mat_a = torch.randn(
|
||||
(batch_size, hidden_size), dtype=torch.bfloat16, device="cuda"
|
||||
).contiguous()
|
||||
mat_b = torch.randn(
|
||||
(num_experts, hidden_size), dtype=torch.bfloat16, device="cuda"
|
||||
).contiguous()
|
||||
bias = torch.randn(
|
||||
num_experts, dtype=torch.bfloat16, device="cuda"
|
||||
).contiguous()
|
||||
|
||||
is_hopper_or_blackwell = current_platform.is_device_capability(
|
||||
90
|
||||
) or current_platform.is_device_capability_family(100)
|
||||
allow_dsv3_router_gemm = (
|
||||
is_hopper_or_blackwell
|
||||
and num_experts in DSV3_SUPPORTED_NUM_EXPERTS
|
||||
and hidden_size in DSV3_SUPPORTED_HIDDEN_SIZES
|
||||
)
|
||||
allow_gpt_oss_router_gemm = (
|
||||
is_hopper_or_blackwell
|
||||
and num_experts in GPT_OSS_SUPPORTED_NUM_EXPERTS
|
||||
and hidden_size in GPT_OSS_SUPPORTED_HIDDEN_SIZES
|
||||
)
|
||||
|
||||
has_bias = False
|
||||
if allow_gpt_oss_router_gemm:
|
||||
has_bias = True
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch":
|
||||
|
||||
def runner():
|
||||
if has_bias:
|
||||
F.linear(mat_a, mat_b, bias)
|
||||
else:
|
||||
F.linear(mat_a, mat_b)
|
||||
elif provider == "vllm":
|
||||
|
||||
def runner():
|
||||
if allow_dsv3_router_gemm:
|
||||
ops.dsv3_router_gemm(mat_a, mat_b, torch.bfloat16)
|
||||
elif allow_gpt_oss_router_gemm:
|
||||
ops.gpt_oss_router_gemm(mat_a, mat_b, bias)
|
||||
else:
|
||||
raise ValueError("Unsupported router gemm")
|
||||
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
runner, quantiles=quantiles
|
||||
)
|
||||
|
||||
def tflops(t_ms):
|
||||
flops = 2 * batch_size * hidden_size * num_experts
|
||||
return flops / (t_ms * 1e-3) / 1e12
|
||||
|
||||
return tflops(ms), tflops(max_ms), tflops(min_ms)
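# e.g. assuming the default openai/gpt-oss-20b reports hidden_size=2880 and 32 experts, at batch_size=16:
# flops = 2 * 16 * 2880 * 32 = 2,949,120, so a 0.01 ms kernel time maps to ~0.29 TFLOP/s.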
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--model", type=str, default="openai/gpt-oss-20b")
|
||||
parser.add_argument("--max-batch-size", default=16, type=int)
|
||||
parser.add_argument("--trust-remote-code", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get the benchmark function
|
||||
benchmark = get_benchmark(args.model, args.max_batch_size, args.trust_remote_code)
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True)
|
||||
@@ -27,7 +27,7 @@ def get_attn_isa(
|
||||
else:
|
||||
if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
|
||||
return "neon"
|
||||
elif torch._C._cpu._is_amx_tile_supported():
|
||||
elif torch.cpu._is_amx_tile_supported():
|
||||
return "amx"
|
||||
else:
|
||||
return "vec"
|
||||
|
||||
@@ -24,7 +24,7 @@ except (ImportError, AttributeError) as e:
|
||||
sys.exit(1)
|
||||
|
||||
# ISA selection following test_cpu_fused_moe.py pattern
|
||||
ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
||||
ISA_CHOICES = ["amx", "vec"] if torch.cpu._is_amx_tile_supported() else ["vec"]
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
|
||||
@@ -39,7 +39,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
|
||||
GIT_TAG 29210221863736a08f71a866459e368ad1ac4a95
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@@ -173,10 +173,13 @@ ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
|
||||
void ScratchPadManager::realloc(size_t new_size) {
|
||||
new_size = round(new_size);
|
||||
if (new_size > size_) {
|
||||
void* new_ptr = std::aligned_alloc(64, new_size);
|
||||
TORCH_CHECK(new_ptr != nullptr,
|
||||
"ScratchPadManager: aligned_alloc failed for size ", new_size);
|
||||
if (ptr_ != nullptr) {
|
||||
std::free(ptr_);
|
||||
}
|
||||
ptr_ = std::aligned_alloc(64, new_size);
|
||||
ptr_ = new_ptr;
|
||||
size_ = new_size;
|
||||
}
|
||||
}
|
||||
|
||||
csrc/libtorch_stable/ops.h (new file, 9 lines)
@@ -0,0 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
|
||||
#ifndef USE_ROCM
|
||||
torch::stable::Tensor permute_cols(torch::stable::Tensor const& A,
|
||||
torch::stable::Tensor const& perm);
|
||||
#endif
|
||||
@@ -1,10 +1,13 @@
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/accelerator.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/headeronly/core/ScalarType.h>
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "torch_utils.h"
|
||||
|
||||
static constexpr int default_threads = 256;
|
||||
static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
|
||||
|
||||
@@ -64,19 +67,22 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
|
||||
// More efficient version of A[..., perm]
|
||||
// taken from gptq_marlin.cu
|
||||
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) {
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
|
||||
auto dev = A.get_device();
|
||||
auto stream = at::cuda::getCurrentCUDAStream(dev);
|
||||
torch::stable::Tensor permute_cols(torch::stable::Tensor const& A,
|
||||
torch::stable::Tensor const& perm) {
|
||||
const int32_t dev = A.get_device_index();
|
||||
const torch::stable::accelerator::DeviceGuard device_guard(dev);
|
||||
const auto stream = get_current_cuda_stream(dev);
|
||||
|
||||
TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16,
|
||||
"Currently only 16bit types are supported");
|
||||
TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
|
||||
TORCH_CHECK(A.size(-1) % 8 == 0,
|
||||
"A columns must be a multiple of 8 (128bits)");
|
||||
auto A_2d = A.view({-1, A.size(-1)});
|
||||
STD_TORCH_CHECK(
|
||||
A.scalar_type() == torch::headeronly::ScalarType::Half ||
|
||||
A.scalar_type() == torch::headeronly::ScalarType::BFloat16,
|
||||
"Currently only 16bit types are supported");
|
||||
STD_TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
|
||||
STD_TORCH_CHECK(A.size(-1) % 8 == 0,
|
||||
"A columns must be a multiple of 8 (128bits)");
|
||||
auto A_2d = torch::stable::view(A, {-1, A.size(-1)});
|
||||
|
||||
torch::Tensor D = torch::empty_like(A);
|
||||
torch::stable::Tensor D = torch::stable::empty_like(A);
|
||||
int sms;
|
||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
|
||||
int block_rows = div_ceil(A_2d.size(0), sms);
|
||||
csrc/libtorch_stable/torch_bindings.cpp (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
#include "ops.h"
|
||||
#include "core/registration.h"
|
||||
|
||||
#include <torch/csrc/stable/library.h>
|
||||
|
||||
// Register ops with STABLE_TORCH_LIBRARY for libtorch stable ABI compatibility.
|
||||
// Note: We register under namespace "_C" so ops are accessible as
|
||||
// torch.ops._C.<op_name> for compatibility with existing code.
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(_C, m) {
|
||||
#ifndef USE_ROCM
|
||||
m.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
|
||||
#endif
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, m) {
|
||||
#ifndef USE_ROCM
|
||||
m.impl("permute_cols", TORCH_BOX(&permute_cols));
|
||||
#endif
|
||||
}
|
||||
|
||||
REGISTER_EXTENSION(_C_stable_libtorch)
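With the registration above, the op is reachable from Python once the extension is loaded. A hedged usage sketch (it assumes a vLLM build that ships this stable-ABI extension and an int32 permutation tensor; permute_cols computes A[..., perm] for contiguous fp16/bf16 CUDA inputs whose last dimension is a multiple of 8):

import torch
import vllm  # noqa: F401  (assumed to load the compiled extensions, including _C_stable_libtorch)

A = torch.randn(4, 128, dtype=torch.bfloat16, device="cuda")   # last dim is a multiple of 8
perm = torch.randperm(128, dtype=torch.int32, device="cuda")   # column permutation, assumed int32
out = torch.ops._C.permute_cols(A, perm)
assert torch.equal(out, A[:, perm.long()])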
|
||||
csrc/libtorch_stable/torch_utils.h (new file, 13 lines)
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Utility to get the current CUDA stream for a given device using stable APIs.
|
||||
// Returns a cudaStream_t for use in kernel launches.
|
||||
inline cudaStream_t get_current_cuda_stream(int32_t device_index) {
|
||||
void* stream_ptr = nullptr;
|
||||
TORCH_ERROR_CODE_CHECK(
|
||||
aoti_torch_get_current_cuda_stream(device_index, &stream_ptr));
|
||||
return reinterpret_cast<cudaStream_t>(stream_ptr);
|
||||
}
|
||||
csrc/moe/gpt_oss_router_gemm.cu (new file, 144 lines)
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/TensorRT-LLM/blob/v1.3.0rc7/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu
|
||||
* Copyright (c) 2025, The vLLM team.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
|
||||
* All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/all.h>
|
||||
#include "gpt_oss_router_gemm.cuh"
|
||||
|
||||
void launch_gpt_oss_router_gemm(__nv_bfloat16* gA, __nv_bfloat16* gB,
|
||||
__nv_bfloat16* gC, __nv_bfloat16* bias,
|
||||
int batch_size, int output_features,
|
||||
int input_features, cudaStream_t stream) {
|
||||
static int const WARP_TILE_M = 16;
|
||||
static int const TILE_M = WARP_TILE_M;
|
||||
static int const TILE_N = 8;
|
||||
static int const TILE_K = 64;
|
||||
static int const STAGES = 16;
|
||||
static int const STAGE_UNROLL = 4;
|
||||
static bool const PROFILE = false;
|
||||
|
||||
CUtensorMap weight_map{};
|
||||
CUtensorMap activation_map{};
|
||||
|
||||
constexpr uint32_t rank = 2;
|
||||
uint64_t size[rank] = {(uint64_t)input_features, (uint64_t)output_features};
|
||||
uint64_t stride[rank - 1] = {input_features * sizeof(__nv_bfloat16)};
|
||||
uint32_t box_size[rank] = {TILE_K, TILE_M};
|
||||
uint32_t elem_stride[rank] = {1, 1};
|
||||
|
||||
CUresult res = cuTensorMapEncodeTiled(
|
||||
&weight_map, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, rank,
|
||||
gB, size, stride, box_size, elem_stride,
|
||||
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
||||
TORCH_CHECK(res == CUDA_SUCCESS,
|
||||
"cuTensorMapEncodeTiled failed for weight_map, error code=",
|
||||
static_cast<int>(res));
|
||||
|
||||
size[1] = batch_size;
|
||||
box_size[1] = TILE_N;
|
||||
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&activation_map, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
rank, gA, size, stride, box_size, elem_stride,
|
||||
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
||||
TORCH_CHECK(res == CUDA_SUCCESS,
|
||||
"cuTensorMapEncodeTiled failed for activation_map, error code=",
|
||||
static_cast<int>(res));
|
||||
|
||||
int smem_size = STAGES * STAGE_UNROLL *
|
||||
(TILE_M * TILE_K * sizeof(__nv_bfloat16) +
|
||||
TILE_N * TILE_K * sizeof(__nv_bfloat16));
|
||||
|
||||
gpuErrChk(cudaFuncSetAttribute(
|
||||
gpt_oss_router_gemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES,
|
||||
STAGE_UNROLL, PROFILE>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
|
||||
|
||||
int tiles_m = (output_features + TILE_M - 1) / TILE_M;
|
||||
int tiles_n = (batch_size + TILE_N - 1) / TILE_N;
|
||||
|
||||
dim3 grid(tiles_m, tiles_n);
|
||||
dim3 block(384);
|
||||
|
||||
cudaLaunchConfig_t config;
|
||||
cudaLaunchAttribute attrs[1];
|
||||
config.gridDim = grid;
|
||||
config.blockDim = block;
|
||||
config.dynamicSmemBytes = smem_size;
|
||||
config.stream = stream;
|
||||
config.attrs = attrs;
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = 1;
|
||||
config.numAttrs = 1;
|
||||
|
||||
cudaLaunchKernelEx(
|
||||
&config,
|
||||
&gpt_oss_router_gemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES,
|
||||
STAGE_UNROLL, PROFILE>,
|
||||
gC, gA, gB, bias, output_features, batch_size, input_features, weight_map,
|
||||
activation_map, nullptr);
|
||||
}
|
||||
|
||||
void gpt_oss_router_gemm_cuda_forward(torch::Tensor& output,
|
||||
torch::Tensor input, torch::Tensor weight,
|
||||
torch::Tensor bias) {
|
||||
auto const batch_size = input.size(0);
|
||||
auto const input_dim = input.size(1);
|
||||
auto const output_dim = weight.size(0);
|
||||
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (input.scalar_type() == at::ScalarType::BFloat16) {
|
||||
launch_gpt_oss_router_gemm((__nv_bfloat16*)input.data_ptr(),
|
||||
(__nv_bfloat16*)weight.data_ptr(),
|
||||
(__nv_bfloat16*)output.mutable_data_ptr(),
|
||||
(__nv_bfloat16*)bias.data_ptr(), batch_size,
|
||||
output_dim, input_dim, stream);
|
||||
} else {
|
||||
throw std::invalid_argument("Unsupported dtype, only supports bfloat16");
|
||||
}
|
||||
}
|
||||
|
||||
void gpt_oss_router_gemm(torch::Tensor& output, torch::Tensor input,
|
||||
torch::Tensor weight, torch::Tensor bias) {
|
||||
TORCH_CHECK(input.dim() == 2, "input must be 2D");
|
||||
TORCH_CHECK(weight.dim() == 2, "weight must be 2D");
|
||||
TORCH_CHECK(bias.dim() == 1, "bias must be 1D");
|
||||
TORCH_CHECK(input.sizes()[1] == weight.sizes()[1],
|
||||
"input.size(1) must match weight.size(1)");
|
||||
TORCH_CHECK(weight.sizes()[0] == bias.sizes()[0],
|
||||
"weight.size(0) must match bias.size(0)");
|
||||
TORCH_CHECK(input.scalar_type() == at::ScalarType::BFloat16,
|
||||
"input tensor must be bfloat16");
|
||||
TORCH_CHECK(weight.scalar_type() == at::ScalarType::BFloat16,
|
||||
"weight tensor must be bfloat16");
|
||||
TORCH_CHECK(bias.scalar_type() == at::ScalarType::BFloat16,
|
||||
"bias tensor must be bfloat16");
|
||||
gpt_oss_router_gemm_cuda_forward(output, input, weight, bias);
|
||||
}
|
||||
csrc/moe/gpt_oss_router_gemm.cuh (new file, 447 lines)
@@ -0,0 +1,447 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/TensorRT-LLM/blob/v1.3.0rc7/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
|
||||
* Copyright (c) 2025, The vLLM team.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
|
||||
* All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "cuda_bf16.h"
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
#include "cuda_pipeline.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda/barrier>
|
||||
#include <cuda/std/utility>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
using barrier = cuda::barrier<cuda::thread_scope_block>;
|
||||
namespace cde = cuda::device::experimental;
|
||||
namespace ptx = cuda::ptx;
|
||||
|
||||
#define gpuErrChk(ans) \
|
||||
{ \
|
||||
gpuAssert((ans), __FILE__, __LINE__); \
|
||||
}
|
||||
|
||||
inline void gpuAssert(cudaError_t code, char const* file, int line,
|
||||
bool abort = true) {
|
||||
if (code != cudaSuccess) {
|
||||
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
|
||||
line);
|
||||
if (abort) {
|
||||
throw std::runtime_error(cudaGetErrorString(code));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
||||
__device__ uint64_t gclock64() {
|
||||
unsigned long long int rv;
|
||||
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(rv));
|
||||
return rv;
|
||||
}
|
||||
|
||||
__device__ void ldmatrix(__nv_bfloat16 rv[2], uint32_t smem_ptr) {
|
||||
int dst;
|
||||
asm volatile("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n"
|
||||
: "=r"(dst)
|
||||
: "r"(smem_ptr));
|
||||
int* rvi = reinterpret_cast<int*>(&rv[0]);
|
||||
rvi[0] = dst;
|
||||
}
|
||||
|
||||
__device__ void ldmatrix2(__nv_bfloat16 rv[4], uint32_t smem_ptr) {
|
||||
int x, y;
|
||||
asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
|
||||
: "=r"(x), "=r"(y)
|
||||
: "r"(smem_ptr));
|
||||
|
||||
int* rvi = reinterpret_cast<int*>(&rv[0]);
|
||||
rvi[0] = x;
|
||||
rvi[1] = y;
|
||||
}
|
||||
|
||||
__device__ void ldmatrix4(__nv_bfloat16 rv[8], uint32_t smem_ptr) {
|
||||
int x, y, z, w;
|
||||
asm volatile(
|
||||
"ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(x), "=r"(y), "=r"(z), "=r"(w)
|
||||
: "r"(smem_ptr));
|
||||
int* rvi = reinterpret_cast<int*>(&rv[0]);
|
||||
rvi[0] = x;
|
||||
rvi[1] = y;
|
||||
rvi[2] = z;
|
||||
rvi[3] = w;
|
||||
}
|
||||
|
||||
__device__ void HMMA_1688(float d[4], __nv_bfloat16 a[4], __nv_bfloat16 b[2],
|
||||
float c[4]) {
|
||||
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a[0]);
|
||||
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b[0]);
|
||||
float const* C = reinterpret_cast<float const*>(&c[0]);
|
||||
float* D = reinterpret_cast<float*>(&d[0]);
|
||||
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
|
||||
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]),
|
||||
"f"(C[3]));
|
||||
}
|
||||
|
||||
__device__ void HMMA_16816(float d[4], __nv_bfloat16 a[8], __nv_bfloat16 b[4],
|
||||
float c[4]) {
|
||||
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a[0]);
|
||||
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b[0]);
|
||||
float const* C = reinterpret_cast<float const*>(&c[0]);
|
||||
float* D = reinterpret_cast<float*>(&d[0]);
|
||||
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
|
||||
: "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
|
||||
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
|
||||
}
|
||||
|
||||
__device__ void bar_wait(uint32_t bar_ptr, int phase) {
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .pred P1;\n"
|
||||
"LAB_WAIT:\n"
|
||||
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
|
||||
"@P1 bra.uni DONE;\n"
|
||||
"bra.uni LAB_WAIT;\n"
|
||||
"DONE:\n"
|
||||
"}\n" ::"r"(bar_ptr),
|
||||
"r"(phase));
|
||||
}
|
||||
|
||||
__device__ bool bar_try_wait(uint32_t bar_ptr, int phase) {
|
||||
uint32_t success;
|
||||
#ifdef INTERNAL
|
||||
asm volatile(".pragma \"set knob DontInsertYield\";\n" : : : "memory");
|
||||
#endif
|
||||
asm volatile(
|
||||
"{\n\t"
|
||||
".reg .pred P1; \n\t"
|
||||
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
|
||||
"selp.b32 %0, 1, 0, P1; \n\t"
|
||||
"}"
|
||||
: "=r"(success)
|
||||
: "r"(bar_ptr), "r"(phase));
|
||||
return success;
|
||||
}
|
||||
|
||||
__device__ uint32_t elect_one_sync() {
|
||||
uint32_t pred = 0;
|
||||
uint32_t laneid = 0;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b32 %%rx;\n"
|
||||
".reg .pred %%px;\n"
|
||||
" elect.sync %%rx|%%px, %2;\n"
|
||||
"@%%px mov.s32 %1, 1;\n"
|
||||
" mov.s32 %0, %%rx;\n"
|
||||
"}\n"
|
||||
: "+r"(laneid), "+r"(pred)
|
||||
: "r"(0xFFFFFFFF));
|
||||
return pred;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct Profile {
|
||||
uint64_t start;
|
||||
uint64_t weight_load_start;
|
||||
uint64_t act_load_start;
|
||||
uint64_t compute_start;
|
||||
uint64_t complete;
|
||||
};
|
||||
|
||||
template <int WARP_TILE_M, int TILE_M, int TILE_N, int TILE_K, int STAGES,
|
||||
int STAGE_UNROLL, bool PROFILE>
|
||||
__global__ __launch_bounds__(384, 1) void gpt_oss_router_gemm_kernel(
|
||||
__nv_bfloat16* output, __nv_bfloat16* weights, __nv_bfloat16* activations,
|
||||
__nv_bfloat16* bias, int M, int N, int K,
|
||||
const __grid_constant__ CUtensorMap weight_map,
|
||||
const __grid_constant__ CUtensorMap activation_map,
|
||||
Profile* profile = nullptr) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
||||
|
||||
if (PROFILE && threadIdx.x == 0 && blockIdx.y == 0)
|
||||
profile[blockIdx.x].start = gclock64();
|
||||
|
||||
extern __shared__ __align__(128) char smem[];
|
||||
|
||||
__nv_bfloat16* sh_weights = (__nv_bfloat16*)&smem[0];
|
||||
__nv_bfloat16* sh_activations =
|
||||
(__nv_bfloat16*)&smem[STAGES * STAGE_UNROLL * TILE_M * TILE_K *
|
||||
sizeof(__nv_bfloat16)];
|
||||
|
||||
#pragma nv_diag_suppress static_var_with_dynamic_init
|
||||
__shared__ barrier bar_wt_ready[STAGES];
|
||||
__shared__ barrier bar_act_ready[STAGES];
|
||||
__shared__ barrier bar_data_consumed[STAGES];
|
||||
|
||||
__shared__ float4 reduction_buffer[128];
|
||||
|
||||
__shared__ nv_bfloat16 sh_bias[TILE_M];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
for (int i = 0; i < STAGES; i++) {
|
||||
init(&bar_wt_ready[i], 1);
|
||||
init(&bar_act_ready[i], 1);
|
||||
init(&bar_data_consumed[i], 32);
|
||||
}
|
||||
ptx::fence_proxy_async(ptx::space_shared);
|
||||
asm volatile("prefetch.tensormap [%0];"
|
||||
:
|
||||
: "l"(reinterpret_cast<uint64_t>(&weight_map))
|
||||
: "memory");
|
||||
asm volatile("prefetch.tensormap [%0];"
|
||||
:
|
||||
: "l"(reinterpret_cast<uint64_t>(&activation_map))
|
||||
: "memory");
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int lane_id = threadIdx.x % 32;
|
||||
|
||||
int phase = 0;
|
||||
|
||||
int mib = blockIdx.x * TILE_M;
|
||||
int ni = blockIdx.y * TILE_N;
|
||||
|
||||
float accum[4];
|
||||
for (int i = 0; i < 4; i++) accum[i] = 0.f;
|
||||
|
||||
int const K_LOOPS_DMA =
|
||||
(K + 4 * TILE_K * STAGE_UNROLL - 1) / (4 * (TILE_K * STAGE_UNROLL));
|
||||
int const K_LOOPS_COMPUTE = K_LOOPS_DMA;
|
||||
|
||||
// Data loading thread
|
||||
if (warp_id >= 4 && elect_one_sync()) {
|
||||
int stage = warp_id % 4;
|
||||
|
||||
bool weight_warp = warp_id < 8;
|
||||
if (!weight_warp) {
|
||||
cudaGridDependencySynchronize();
|
||||
cudaTriggerProgrammaticLaunchCompletion();
|
||||
}
|
||||
|
||||
for (int ki = 0; ki < K_LOOPS_DMA; ki++) {
|
||||
int k = (ki * 4 + (warp_id % 4)) * TILE_K * STAGE_UNROLL;
|
||||
|
||||
uint64_t desc_ptr_wt = reinterpret_cast<uint64_t>(&weight_map);
|
||||
uint64_t desc_ptr_act = reinterpret_cast<uint64_t>(&activation_map);
|
||||
|
||||
uint32_t bar_ptr_wt = __cvta_generic_to_shared(&bar_wt_ready[stage]);
|
||||
uint32_t bar_ptr_act = __cvta_generic_to_shared(&bar_act_ready[stage]);
|
||||
int bytes_wt = TILE_M * TILE_K * sizeof(__nv_bfloat16);
|
||||
int bytes_act = TILE_N * TILE_K * sizeof(__nv_bfloat16);
|
||||
|
||||
bar_wait(__cvta_generic_to_shared(&bar_data_consumed[stage]), phase ^ 1);
|
||||
|
||||
if (weight_warp)
|
||||
asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
|
||||
:
|
||||
: "r"(bar_ptr_wt), "r"(STAGE_UNROLL * bytes_wt));
|
||||
if (!weight_warp)
|
||||
asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
|
||||
:
|
||||
: "r"(bar_ptr_act), "r"(STAGE_UNROLL * bytes_act));
|
||||
|
||||
if (PROFILE && blockIdx.y == 0 && ki == 0 && weight_warp)
|
||||
profile[blockIdx.x].weight_load_start = gclock64();
|
||||
if (PROFILE && blockIdx.y == 0 && ki == 0 && !weight_warp)
|
||||
profile[blockIdx.x].act_load_start = gclock64();
|
||||
|
||||
for (int i = 0; i < STAGE_UNROLL; i++) {
|
||||
uint32_t smem_ptr_wt = __cvta_generic_to_shared(
|
||||
&sh_weights[(stage * STAGE_UNROLL + i) * TILE_M * TILE_K]);
|
||||
uint32_t crd0 = k + i * TILE_K;
|
||||
uint32_t crd1 = mib;
|
||||
if (weight_warp)
|
||||
asm volatile(
|
||||
"cp.async.bulk.tensor.2d.shared::cta.global.mbarrier::complete_"
|
||||
"tx::bytes [%0], [%1, {%3,%4}], "
|
||||
"[%2];"
|
||||
:
|
||||
: "r"(smem_ptr_wt), "l"(desc_ptr_wt), "r"(bar_ptr_wt), "r"(crd0),
|
||||
"r"(crd1)
|
||||
: "memory");
|
||||
|
||||
uint32_t smem_ptr_act = __cvta_generic_to_shared(
|
||||
&sh_activations[(stage * STAGE_UNROLL + i) * TILE_N * TILE_K]);
|
||||
crd0 = k + i * TILE_K;
|
||||
crd1 = ni;
|
||||
if (!weight_warp)
|
||||
asm volatile(
|
||||
"cp.async.bulk.tensor.2d.shared::cta.global.mbarrier::complete_"
|
||||
"tx::bytes [%0], [%1, {%3,%4}], "
|
||||
"[%2];"
|
||||
:
|
||||
: "r"(smem_ptr_act), "l"(desc_ptr_act), "r"(bar_ptr_act),
|
||||
"r"(crd0), "r"(crd1)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
stage += 4;
|
||||
if (stage >= STAGES) {
|
||||
stage = warp_id % 4;
|
||||
phase ^= 1;
|
||||
}
|
||||
}
|
||||
// Wait for pending loads to be consumed before exiting, to avoid race
|
||||
for (int i = 0; i < (STAGES / 4) - 1; i++) {
|
||||
bar_wait(__cvta_generic_to_shared(&bar_data_consumed[stage]), phase ^ 1);
|
||||
stage += 4;
|
||||
if (stage >= STAGES) {
|
||||
stage = warp_id % 4;
|
||||
phase ^= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compute threads
|
||||
else if (warp_id < 4) {
|
||||
// Sneak the bias load into the compute warps since they're just waiting for
|
||||
// stuff anyway
|
||||
if (threadIdx.x < TILE_M) sh_bias[threadIdx.x] = bias[mib + threadIdx.x];
|
||||
|
||||
int stage = warp_id;
|
||||
|
||||
int phase = 0;
|
||||
int lane_id_div8 = lane_id / 8;
|
||||
int lane_id_mod8 = lane_id % 8;
|
||||
|
||||
int lane_row_offset_wt = (lane_id_div8 % 2) ? 8 : 0;
|
||||
int lane_col_offset_wt = (lane_id_div8 / 2) ? 1 : 0;
|
||||
|
||||
int row_wt = lane_id_mod8 + lane_row_offset_wt;
|
||||
int row_act = lane_id_mod8;
|
||||
|
||||
int row_offset_wt = (reinterpret_cast<uintptr_t>(sh_weights) / 128) % 8;
|
||||
int row_offset_act = row_offset_wt;
|
||||
|
||||
uint32_t bar_ptr_wt = __cvta_generic_to_shared(&bar_wt_ready[stage]);
|
||||
uint32_t bar_ptr_act = __cvta_generic_to_shared(&bar_act_ready[stage]);
|
||||
|
||||
bool weight_ready = bar_try_wait(bar_ptr_wt, phase);
|
||||
bool act_ready = bar_try_wait(bar_ptr_act, phase);
|
||||
|
||||
#pragma unroll 2
|
||||
for (int ki = 0; ki < K_LOOPS_COMPUTE; ki++) {
|
||||
int next_stage = stage + 4;
|
||||
int next_phase = phase;
|
||||
if (next_stage >= STAGES) {
|
||||
next_stage = warp_id;
|
||||
next_phase ^= 1;
|
||||
}
|
||||
|
||||
while (!weight_ready || !act_ready) {
|
||||
weight_ready = bar_try_wait(bar_ptr_wt, phase);
|
||||
act_ready = bar_try_wait(bar_ptr_act, phase);
|
||||
}
|
||||
|
||||
if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0 && ki == 0)
|
||||
profile[blockIdx.x].compute_start = gclock64();
|
||||
|
||||
if (ki + 1 < K_LOOPS_COMPUTE) {
|
||||
weight_ready = bar_try_wait(
|
||||
__cvta_generic_to_shared(&bar_wt_ready[next_stage]), next_phase);
|
||||
act_ready = bar_try_wait(
|
||||
__cvta_generic_to_shared(&bar_act_ready[next_stage]), next_phase);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int su = 0; su < STAGE_UNROLL; su++) {
|
||||
__nv_bfloat16* ptr_weights =
|
||||
&sh_weights[(stage * STAGE_UNROLL + su) * TILE_M * TILE_K];
|
||||
__nv_bfloat16* ptr_act =
|
||||
&sh_activations[(stage * STAGE_UNROLL + su) * TILE_N * TILE_K];
|
||||
|
||||
#pragma unroll
|
||||
for (int kii = 0; kii < TILE_K / 16; kii++) {
|
||||
__nv_bfloat16 a[8];
|
||||
__nv_bfloat16 b[4];
|
||||
|
||||
int col = 2 * kii + lane_col_offset_wt;
|
||||
int col_sw = ((row_wt + row_offset_wt) % 8) ^ col;
|
||||
|
||||
ldmatrix4(a, __cvta_generic_to_shared(
|
||||
&ptr_weights[row_wt * TILE_K + col_sw * 8]));
|
||||
|
||||
col = 2 * kii + lane_id_div8;
|
||||
col_sw = ((row_act + row_offset_act) % 8) ^ col;
|
||||
|
||||
ldmatrix2(b, __cvta_generic_to_shared(
|
||||
&ptr_act[row_act * TILE_K + 8 * col_sw]));
|
||||
|
||||
HMMA_16816(accum, a, b, accum);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t bar_c = __cvta_generic_to_shared(&bar_data_consumed[stage]);
|
||||
asm volatile("mbarrier.arrive.shared::cta.b64 _, [%0];" : : "r"(bar_c));
|
||||
|
||||
stage = next_stage;
|
||||
phase = next_phase;
|
||||
}
|
||||
|
||||
float4 accum4;
|
||||
accum4.x = accum[0];
|
||||
accum4.y = accum[1];
|
||||
accum4.z = accum[2];
|
||||
accum4.w = accum[3];
|
||||
reduction_buffer[threadIdx.x] = accum4;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (warp_id == 0) {
|
||||
int mi = mib + warp_id * WARP_TILE_M;
|
||||
int tm = mi + lane_id / 4;
|
||||
int tn = ni + 2 * (lane_id % 4);
|
||||
|
||||
float4 accum1 = reduction_buffer[32 + threadIdx.x];
|
||||
float4 accum2 = reduction_buffer[64 + threadIdx.x];
|
||||
float4 accum3 = reduction_buffer[96 + threadIdx.x];
|
||||
|
||||
accum[0] = accum[0] + accum1.x + accum2.x + accum3.x;
|
||||
accum[1] = accum[1] + accum1.y + accum2.y + accum3.y;
|
||||
accum[2] = accum[2] + accum1.z + accum2.z + accum3.z;
|
||||
accum[3] = accum[3] + accum1.w + accum2.w + accum3.w;
|
||||
|
||||
float bias_lo = __bfloat162float(sh_bias[tm - mib]);
|
||||
float bias_hi = __bfloat162float(sh_bias[tm + 8 - mib]);
|
||||
|
||||
if (tn < N && tm < M)
|
||||
output[tn * M + tm] = __float2bfloat16(accum[0] + bias_lo);
|
||||
if (tn + 1 < N && tm < M)
|
||||
output[(tn + 1) * M + tm] = __float2bfloat16(accum[1] + bias_lo);
|
||||
if (tn < N && tm + 8 < M)
|
||||
output[tn * M + tm + 8] = __float2bfloat16(accum[2] + bias_hi);
|
||||
if (tn + 1 < N && tm + 8 < M)
|
||||
output[(tn + 1) * M + tm + 8] = __float2bfloat16(accum[3] + bias_hi);
|
||||
|
||||
if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0)
|
||||
profile[blockIdx.x].complete = gclock64();
|
||||
}
|
||||
}
|
||||
#endif // end if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
||||
}
|
||||
@@ -70,4 +70,8 @@ torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
// Supports num_tokens in [1, 16], num_experts in {256, 384}, hidden_dim = 7168
void dsv3_router_gemm(torch::Tensor& output, const torch::Tensor& mat_a,
                      const torch::Tensor& mat_b);

// gpt-oss optimized router GEMM kernel for SM90+
void gpt_oss_router_gemm(torch::Tensor& output, torch::Tensor input,
                         torch::Tensor weight, torch::Tensor bias);
#endif

@@ -132,6 +132,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // DeepSeek V3 optimized router GEMM for SM90+
  m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
  // conditionally compiled so impl registration is in source file

  // gpt-oss optimized router GEMM kernel for SM90+
  m.def(
      "gpt_oss_router_gemm(Tensor! output, Tensor input, Tensor weights, "
      "Tensor bias) -> ()");
  m.impl("gpt_oss_router_gemm", torch::kCUDA, &gpt_oss_router_gemm);
#endif
}

16
csrc/ops.h
@@ -201,7 +201,6 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
||||
torch::Tensor _zeros, int64_t split_k_iters,
|
||||
int64_t thx, int64_t thy);
|
||||
|
||||
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
|
||||
#endif
|
||||
|
||||
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
||||
@@ -262,7 +261,8 @@ void get_cutlass_moe_mm_data(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
@@ -295,10 +295,14 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||
|
||||
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_scale,
|
||||
torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout);
|
||||
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
|
||||
torch::Tensor const& input, torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout);
|
||||
|
||||
void scaled_fp4_quant_out(torch::Tensor const& input,
|
||||
torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout, torch::Tensor& output,
|
||||
torch::Tensor& output_scale);
|
||||
|
||||
void scaled_fp4_experts_quant(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
@@ -51,9 +53,10 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
|
||||
torch::Tensor const& output_scale_offset_by_experts);
|
||||
#endif
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_sf, torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout) {
|
||||
void scaled_fp4_quant_out(torch::Tensor const& input,
|
||||
torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout, torch::Tensor& output,
|
||||
torch::Tensor& output_sf) {
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
|
||||
@@ -62,6 +65,34 @@ void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
}

std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
    torch::Tensor const& input, torch::Tensor const& input_sf,
    bool is_sf_swizzled_layout) {
  int64_t n = input.size(-1);
  int64_t m = input.numel() / n;
  auto device = input.device();

  // Two fp4 values packed into a uint8
  auto output = torch::empty(
      {m, n / 2}, torch::TensorOptions().device(device).dtype(torch::kUInt8));

  torch::Tensor output_sf;
  if (is_sf_swizzled_layout) {
    auto [sf_m, sf_n] = vllm::computeSwizzledSFShape(m, n);
    output_sf = torch::empty(
        {sf_m, sf_n},
        torch::TensorOptions().device(device).dtype(torch::kInt32));
  } else {
    output_sf = torch::empty(
        {m, n / CVT_FP4_SF_VEC_SIZE},
        torch::TensorOptions().device(device).dtype(torch::kUInt8));
  }

  scaled_fp4_quant_out(input, input_sf, is_sf_swizzled_layout, output,
                       output_sf);
  return {output, output_sf};
}
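
A hedged host-side usage sketch of the new functional entry point (the tensor shapes and the exact form expected for `input_sf` are assumptions for illustration; only the call shape and output layouts follow from the definition above):

```cpp
// Hypothetical caller: quantize a bf16 activation matrix to packed NVFP4.
torch::Tensor x = torch::randn(
    {8, 7168}, torch::TensorOptions().device(torch::kCUDA).dtype(torch::kBFloat16));
torch::Tensor x_sf = torch::ones(
    {1}, torch::TensorOptions().device(torch::kCUDA).dtype(torch::kFloat32));  // assumed global scale
auto [packed, sf] = scaled_fp4_quant_func(x, x_sf, /*is_sf_swizzled_layout=*/true);
// packed: [8, 3584] uint8 (two e2m1 values per byte)
// sf:     int32 swizzled layout sized by vllm::computeSwizzledSFShape(8, 7168)
```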
|
||||
|
||||
void scaled_fp4_experts_quant(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp8.h>
|
||||
#include <utility>
|
||||
|
||||
#include "../../cuda_vec_utils.cuh"
|
||||
|
||||
@@ -54,6 +55,18 @@ inline int computeEffectiveRows(int m) {
|
||||
return round_up(m, ROW_TILE);
|
||||
}
|
||||
|
||||
// Compute the shape of the swizzled SF output tensor.
// Returns (rounded_m, rounded_n / 4) where:
//   rounded_m = round_up(m, 128)
//   rounded_n = round_up(n / CVT_FP4_SF_VEC_SIZE, 4)
inline std::pair<int64_t, int64_t> computeSwizzledSFShape(int64_t m,
                                                          int64_t n) {
  int64_t rounded_m = round_up(m, static_cast<int64_t>(128));
  int64_t scale_n = n / CVT_FP4_SF_VEC_SIZE;
  int64_t rounded_n = round_up(scale_n, static_cast<int64_t>(4));
  return {rounded_m, rounded_n / 4};
}
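
As a worked example (assuming `CVT_FP4_SF_VEC_SIZE == 16`, which is defined elsewhere and not shown in this hunk):

```cpp
// m = 8, n = 7168:
//   rounded_m = round_up(8, 128)  = 128
//   scale_n   = 7168 / 16         = 448
//   rounded_n = round_up(448, 4)  = 448
//   result    = {128, 448 / 4}    = {128, 112}
auto [sf_m, sf_n] = computeSwizzledSFShape(8, 7168);  // -> (128, 112) under the assumption above
```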
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
|
||||
uint32_t val;
|
||||
|
||||
@@ -286,6 +286,15 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
        "Outer scale stride must be 1 when scales are not transposed");
  }

  int64_t hidden_size = input.size(-1);
  TORCH_CHECK(hidden_size > 0 && hidden_size % group_size == 0,
              "hidden_size must be a positive multiple of group_size");
  int64_t num_tokens = input.numel() / hidden_size;
  int64_t num_groups = hidden_size / group_size;
  TORCH_CHECK(scales.numel() >= num_tokens * num_groups,
              "scales buffer too small: need ", num_tokens * num_groups,
              " elements, got ", scales.numel());

  rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
                                    var_epsilon, scale_ub, residual,
                                    is_scale_transposed);
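
A quick numeric illustration of the new check (sizes are hypothetical):

```cpp
// hidden_size = 7168, group_size = 128 -> num_groups = 56
// num_tokens  = 32                     -> scales must hold >= 32 * 56 = 1792 elements
int64_t num_groups_ex = 7168 / 128;          // 56 groups per token
int64_t min_scales_ex = 32 * num_groups_ex;  // 1792
```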
|
||||
|
||||
@@ -17,8 +17,11 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
                                      int32_t* problem_sizes2,
                                      int32_t* atomic_buffer,
                                      const int topk_length, const int n,
                                      const int k) {
                                      const int k, const bool is_gated) {
  int expert_id = blockIdx.x;
  // For gated activations (gate + up), first GEMM output is 2*n.
  // For non-gated activations (up only), first GEMM output is n.
  int const n1 = is_gated ? 2 * n : n;

  int occurrences = 0;
  for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
@@ -31,13 +34,13 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
  int final_occurrences = atomic_buffer[expert_id];
  if constexpr (!SWAP_AB) {
    problem_sizes1[expert_id * 3] = final_occurrences;
    problem_sizes1[expert_id * 3 + 1] = 2 * n;
    problem_sizes1[expert_id * 3 + 1] = n1;
    problem_sizes1[expert_id * 3 + 2] = k;
    problem_sizes2[expert_id * 3] = final_occurrences;
    problem_sizes2[expert_id * 3 + 1] = k;
    problem_sizes2[expert_id * 3 + 2] = n;
  } else {
    problem_sizes1[expert_id * 3] = 2 * n;
    problem_sizes1[expert_id * 3] = n1;
    problem_sizes1[expert_id * 3 + 1] = final_occurrences;
    problem_sizes1[expert_id * 3 + 2] = k;
    problem_sizes2[expert_id * 3] = k;
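
To make the `is_gated` effect concrete, a small host-side sketch (hypothetical helper and sizes) mirroring the non-`SWAP_AB` branch above:

```cpp
struct ExpertProblem { int gemm1[3]; int gemm2[3]; };

// tokens: rows routed to this expert; n, k: intermediate and hidden sizes.
ExpertProblem problem_for(int tokens, int n, int k, bool is_gated) {
  const int n1 = is_gated ? 2 * n : n;  // gate + up vs. up only
  return {{tokens, n1, k},              // first GEMM
          {tokens, k, n}};              // second GEMM
}
// e.g. tokens = 4, n = 4096, k = 7168:
//   gated:     gemm1 = {4, 8192, 7168}, gemm2 = {4, 7168, 4096}
//   non-gated: gemm1 = {4, 4096, 7168}, gemm2 = {4, 7168, 4096}
```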
|
||||
@@ -107,13 +110,11 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& atomic_buffer,
|
||||
int64_t num_experts, int64_t n,
|
||||
int64_t k, cudaStream_t stream,
|
||||
const bool swap_ab) {
|
||||
inline void launch_compute_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, torch::Tensor& atomic_buffer,
|
||||
int64_t num_experts, int64_t n, int64_t k, cudaStream_t stream,
|
||||
const bool swap_ab, const bool is_gated) {
|
||||
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
|
||||
|
||||
auto const* topk_ptr = topk_ids.data_ptr<int32_t>();
|
||||
@@ -125,7 +126,7 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
compute_problem_sizes<SwapAB><<<num_experts, num_threads, 0, stream>>>(
|
||||
topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
|
||||
static_cast<int>(topk_ids.numel()), static_cast<int>(n),
|
||||
static_cast<int>(k));
|
||||
static_cast<int>(k), is_gated);
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
@@ -222,7 +223,8 @@ void get_cutlass_moe_mm_data_caller(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
||||
auto options_int32 =
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
||||
@@ -236,7 +238,7 @@ void get_cutlass_moe_mm_data_caller(
|
||||
|
||||
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
|
||||
atomic_buffer, num_experts, n, k, stream,
|
||||
may_swap_ab);
|
||||
may_swap_ab, is_gated);
|
||||
|
||||
if (blockscale_offsets.has_value()) {
|
||||
// fp4 path
|
||||
|
||||
@@ -75,7 +75,8 @@ void get_cutlass_moe_mm_data_caller(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
@@ -278,7 +279,8 @@ void get_cutlass_moe_mm_data(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated) {
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
@@ -288,7 +290,7 @@ void get_cutlass_moe_mm_data(
|
||||
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
|
||||
problem_sizes2, input_permutation,
|
||||
output_permutation, num_experts, n, k,
|
||||
blockscale_offsets);
|
||||
blockscale_offsets, is_gated);
|
||||
return;
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
|
||||
@@ -26,6 +26,16 @@
|
||||
#define __HIP__GFX9__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && \
|
||||
(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1150__) || \
|
||||
defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__))
|
||||
#define __HIP__GFX1X__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__))
|
||||
#define __HIP__GFX12__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__MI3XX__
|
||||
#endif
|
||||
@@ -37,15 +47,31 @@
|
||||
#endif
|
||||
|
||||
int get_lds_size() {
  static bool is_cached = false;
  static int result;
  if (is_cached == false) {
    auto dprops = at::cuda::getCurrentDeviceProperties();
    std::string device_arch = dprops->gcnArchName;
    size_t substring = device_arch.find("gfx95");
    result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
    is_cached = true;
  }
  static const int result = [] {
    const auto* dprops = at::cuda::getCurrentDeviceProperties();
    const std::string device_arch = dprops->gcnArchName;
    return device_arch.find("gfx95") == std::string::npos ? 64 * 1024
                                                          : 160 * 1024;
  }();
  return result;
}
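
The refactor above swaps the hand-rolled `is_cached` flag for a function-local `static` initialized by an immediately-invoked lambda; C++11 guarantees that initializer runs exactly once, even with concurrent callers. The same pattern in isolation (placeholder query):

```cpp
int cached_device_query() {
  static const int value = [] {
    // Runs once on first call; subsequent calls just return the cached value.
    return 42;  // placeholder for an expensive device-properties lookup
  }();
  return value;
}
```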
|
||||
|
||||
bool on_gfx1x() {
|
||||
static const bool result = [] {
|
||||
const auto* dprops = at::cuda::getCurrentDeviceProperties();
|
||||
const std::string device_arch = dprops->gcnArchName;
|
||||
return device_arch.find("gfx11") != std::string::npos ||
|
||||
device_arch.find("gfx12") != std::string::npos;
|
||||
}();
|
||||
return result;
|
||||
}
|
||||
|
||||
bool on_gfx12() {
|
||||
static const bool result = [] {
|
||||
const auto* dprops = at::cuda::getCurrentDeviceProperties();
|
||||
const std::string device_arch = dprops->gcnArchName;
|
||||
return device_arch.find("gfx12") != std::string::npos;
|
||||
}();
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -286,21 +312,35 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#define DOT2C(V0, V2, V3) \
|
||||
if constexpr (std::is_same_v<scalar_t, half>) { \
|
||||
asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
|
||||
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
|
||||
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
#if defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)
|
||||
#define DOT2C(V0, V2, V3) \
|
||||
if constexpr (std::is_same_v<scalar_t, half>) { \
|
||||
asm("v_dot2c_f32_f16 %0, %2, %3" \
|
||||
: "=v"(V0) \
|
||||
: "0"(V0), "v"(V2), "v"(V3)); \
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
|
||||
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
|
||||
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
#elif defined(__HIP__GFX1X__)
|
||||
// gfx1x: v_dot2_f32_f16 (VOP3-P, dot10-insts, available on gfx11+gfx12)
|
||||
#define DOT2C(V0, V2, V3) \
|
||||
if constexpr (std::is_same_v<scalar_t, half>) { \
|
||||
asm("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(V0) : "v"(V2), "v"(V3)); \
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
|
||||
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
|
||||
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
#endif
|
||||
|
||||
// To avoid LLVM silently upcasting to double
|
||||
__device__ inline unsigned int min__(uint32_t a, uint32_t b) {
|
||||
return min(a, b);
|
||||
}
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
|
||||
// This version targets cases where A[] fits LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@@ -442,14 +482,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
1); // row_shr2
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
|
||||
1); // row_shr1
|
||||
#if defined(__HIP__GFX9__)
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
|
||||
1); // ROW_BCAST15
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
|
||||
1); // ROW_BCAST31
|
||||
#else
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -469,9 +513,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
#ifdef __HIP__GFX9__
|
||||
#pragma unroll
|
||||
for (int n = 0; n < N; n++) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
/*float accm1 = 0;
|
||||
for (int i=0; i<64; i++)
|
||||
@@ -498,7 +543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum4[n][y][0] = accm;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -513,11 +558,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __HIP__GFX9__ (MFMA path)
|
||||
}
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
|
||||
@@ -528,9 +574,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
|
||||
// This version targets cases where A[] marginally exceeds LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@@ -657,14 +703,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
1); // row_shr2
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
|
||||
1); // row_shr1
|
||||
#if defined(__HIP__GFX9__)
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
|
||||
1); // ROW_BCAST15
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
|
||||
1); // ROW_BCAST31
|
||||
#else
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -686,9 +736,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
#ifdef __HIP__GFX9__
|
||||
#pragma unroll
|
||||
for (int n = 0; n < N; n++) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
// float accm1 = 0;
|
||||
// for (int i=0; i<64; i++)
|
||||
@@ -713,7 +764,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum4[n][y][0] = accm;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -730,6 +781,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __HIP__GFX9__ (MFMA path)
|
||||
}
|
||||
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
@@ -746,7 +798,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
|
||||
@@ -756,9 +808,9 @@ __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
|
||||
// This version targets big A[] cases, where it is much larger than LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@@ -1004,14 +1056,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
1); // row_shr2
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
|
||||
1); // row_shr1
|
||||
#if defined(__HIP__GFX9__)
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
|
||||
1); // ROW_BCAST15
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
|
||||
1); // ROW_BCAST31
|
||||
#else
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -1033,9 +1089,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
#ifdef __HIP__GFX9__
|
||||
#pragma unroll
|
||||
for (int n = 0; n < N; n++) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
float accm = sum4[n][y][0];
|
||||
accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
|
||||
@@ -1057,7 +1114,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum4[n][y][0] = accm;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -1074,6 +1131,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __HIP__GFX9__ (MFMA path)
|
||||
}
|
||||
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
@@ -1090,7 +1148,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
|
||||
@@ -1101,7 +1159,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif
|
||||
|
||||
// Find the min val of div2 that doesn't increase N/(div1*div2)
|
||||
int mindiv(int N, int div1, int div2) {
|
||||
@@ -1148,40 +1206,40 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITK(_YTILE, _UNRL, _N) \
|
||||
#define WVSPLITK_CFG(_THRDS, _WVPRGRP, _YTILE, _UNRL, _N) \
|
||||
{ \
|
||||
dim3 block(64, 16); \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
|
||||
dim3 block(_THRDS, _WVPRGRP); \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, _WVPRGRP); \
|
||||
if ((Kbp_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
wvSplitK_hf_sml_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
else if (Kbp_in * N_in <= max_lds_len * 1.2) \
|
||||
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
wvSplitK_hf_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
else \
|
||||
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
wvSplitK_hf_big_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
}
|
||||
|
||||
#define WVSPLIT_TILE(_sYT, __N) \
|
||||
#define WVSPLIT_TILE_CFG(_THRDS, _WVPRGRP, _sYT, __N) \
|
||||
{ \
|
||||
bool fit_lds = (Kbp_in * N_in <= max_lds_len); \
|
||||
if (_sYT <= 1) \
|
||||
WVSPLITK(1, 4, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 1, 4, __N) \
|
||||
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
|
||||
WVSPLITK(2, 2, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 2, 2, __N) \
|
||||
else if (_sYT <= 4 * 3) \
|
||||
WVSPLITK(3, 2, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 3, 2, __N) \
|
||||
else if (__N == 4) \
|
||||
WVSPLITK(4, 1, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 1, __N) \
|
||||
else \
|
||||
WVSPLITK(4, 2, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 2, __N) \
|
||||
}
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
|
||||
@@ -1198,18 +1256,31 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
// then cut the active waves to balance their distribution...
|
||||
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
|
||||
|
||||
const bool use_wave32 = on_gfx1x();
|
||||
switch (N_in) {
|
||||
case 1:
|
||||
WVSPLIT_TILE(sYT, 1)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 1)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 1)
|
||||
break;
|
||||
case 2:
|
||||
WVSPLIT_TILE(sYT, 2)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 2)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 2)
|
||||
break;
|
||||
case 3:
|
||||
WVSPLIT_TILE(sYT, 3)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 3)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 3)
|
||||
break;
|
||||
case 4:
|
||||
WVSPLIT_TILE(sYT, 4)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 4)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 4)
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
@@ -1653,7 +1724,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
|
||||
__global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
|
||||
@@ -1688,6 +1759,8 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
|
||||
in_a.dtype() == torch::kBFloat16);
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
|
||||
auto out_c = torch::empty(
|
||||
{N_in, M_in},
|
||||
torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
|
||||
@@ -1696,7 +1769,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
|
||||
dim3 grid(CuCount);
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
// const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
@@ -1773,7 +1845,7 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@@ -1817,12 +1889,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
|
||||
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
|
||||
|
||||
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
|
||||
float sA = *s_A;
|
||||
float sB = *s_B;
|
||||
|
||||
while (m < M) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
|
||||
float sum[N][YTILE] = {};
|
||||
#else
|
||||
// gfx9: MFMA accumulation
|
||||
scalar8 sum[N][YTILE] = {};
|
||||
#endif
|
||||
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
|
||||
bigType bigA[N][UNRL] = {};
|
||||
bigType bigB[YTILE][UNRL];
|
||||
@@ -1854,6 +1931,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
for (uint32_t n = 0; n < N; n++) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < A_CHUNK / 4; i++) {
|
||||
sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
|
||||
bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9: MFMA path
|
||||
for (int i = 0; i < A_CHUNK; i += 8) {
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
|
||||
@@ -1861,11 +1949,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Final reduction
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9 MFMA reduction
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
float accm0 = sum[n][y][0];
|
||||
@@ -1880,8 +1990,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum[n][y][0] = accm0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
const bool writeback_lane =
|
||||
#ifdef __HIP__GFX12__
|
||||
threadIdx.x == (THRDS - 1);
|
||||
#else
|
||||
threadIdx.x == 0;
|
||||
#endif
|
||||
if (writeback_lane) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -1892,13 +2009,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
if (y + m >= M) break; // To avoid mem access fault.
|
||||
sum[n][y][0] *= sA * sB;
|
||||
#ifdef __HIP__GFX12__
|
||||
float result = sum[n][y] * sA * sB;
|
||||
#else
|
||||
float result = sum[n][y][0] * sA * sB;
|
||||
#endif
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum[n][y][0] += __half2float(biases[n][y]);
|
||||
result += __half2float(biases[n][y]);
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
sum[n][y][0] += __bfloat162float(biases[n][y]);
|
||||
result += __bfloat162float(biases[n][y]);
|
||||
}
|
||||
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
|
||||
C[m + y + n * M] = __float2s<scalar_t>(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1906,7 +2027,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
|
||||
@@ -1918,9 +2039,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@@ -1963,12 +2084,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
|
||||
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
|
||||
|
||||
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
|
||||
float sA = *s_A;
|
||||
float sB = *s_B;
|
||||
|
||||
while (m < M) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
|
||||
float sum[N][YTILE] = {};
|
||||
#else
|
||||
// gfx9: MFMA accumulation
|
||||
scalar8 sum[N][YTILE] = {};
|
||||
#endif
|
||||
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
|
||||
bigType bigA[N][UNRL] = {};
|
||||
bigType bigB[YTILE][UNRL];
|
||||
@@ -2002,6 +2128,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
for (uint32_t n = 0; n < N; n++) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < A_CHUNK / 4; i++) {
|
||||
sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
|
||||
bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9: MFMA path
|
||||
for (int i = 0; i < A_CHUNK; i += 8) {
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
|
||||
@@ -2009,11 +2146,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Final reduction
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9 MFMA reduction
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
float accm0 = sum[n][y][0];
|
||||
@@ -2028,8 +2187,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum[n][y][0] = accm0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
const bool writeback_lane =
|
||||
#ifdef __HIP__GFX12__
|
||||
threadIdx.x == (THRDS - 1);
|
||||
#else
|
||||
threadIdx.x == 0;
|
||||
#endif
|
||||
if (writeback_lane) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -2040,13 +2206,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
if (y + m >= M) break; // To avoid mem access fault.
|
||||
sum[n][y][0] *= sA * sB;
|
||||
#ifdef __HIP__GFX12__
|
||||
float result = sum[n][y] * sA * sB;
|
||||
#else
|
||||
float result = sum[n][y][0] * sA * sB;
|
||||
#endif
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum[n][y][0] += __half2float(biases[n][y]);
|
||||
result += __half2float(biases[n][y]);
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
sum[n][y][0] += __bfloat162float(biases[n][y]);
|
||||
result += __bfloat162float(biases[n][y]);
|
||||
}
|
||||
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
|
||||
C[m + y + n * M] = __float2s<scalar_t>(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2054,7 +2224,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
|
||||
@@ -2066,7 +2236,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
|
||||
const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
|
||||
void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
|
||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
@@ -2099,24 +2269,30 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size();
|
||||
|
||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
|
||||
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
|
||||
s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} else { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
|
||||
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
|
||||
s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} \
|
||||
#define WVSPLITKQ_IMPL(_THRDS, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
{ \
|
||||
dim3 block(_THRDS, _WvPrGrp); \
|
||||
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
|
||||
wvSplitKQ_hf_sml_<fptype, fp8_t, _THRDS, _YTILEs, _WvPrGrp, 16, _UNRLs, \
|
||||
_N><<<grid, block, 0, stream>>>( \
|
||||
K_in, Kap_in, Kbp_in, M_in, Bx_in, By_in, b_ptr, a_ptr, bias_ptr, \
|
||||
c_ptr, s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} else { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
|
||||
wvSplitKQ_hf_<fptype, fp8_t, _THRDS, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
|
||||
s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
if (on_gfx12()) \
|
||||
WVSPLITKQ_IMPL(32, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
else \
|
||||
WVSPLITKQ_IMPL(64, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] {
|
||||
using fptype = typename scalar<scalar_t>::type;
|
||||
auto c_ptr = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||
@@ -2136,10 +2312,10 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
|
||||
WVSPLITKQ(16, 2, 2, 2, 2, 2)
|
||||
break;
|
||||
case 3:
|
||||
WVSPLITKQ(16, 2, 2, 2, 2, 3)
|
||||
WVSPLITKQ(16, 2, 2, 1, 1, 3)
|
||||
break;
|
||||
case 4:
|
||||
WVSPLITKQ(16, 2, 2, 2, 2, 4)
|
||||
WVSPLITKQ(16, 2, 2, 1, 1, 4)
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
|
||||
@@ -303,9 +303,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
") -> Tensor");
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
|
||||
ops.impl("permute_cols", torch::kCUDA, &permute_cols);
|
||||
|
||||
// Marlin Optimized Quantized GEMM (supports GPTQ, AWQ, FP8, NVFP4, MXFP4).
|
||||
ops.def(
|
||||
"marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
|
||||
@@ -489,8 +486,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor! problem_sizes1, Tensor! problem_sizes2, "
|
||||
" Tensor! input_permutation, "
|
||||
" Tensor! output_permutation, int num_experts, "
|
||||
" int n, int k, Tensor? blockscale_offsets) -> "
|
||||
"()");
|
||||
" int n, int k, Tensor? blockscale_offsets, "
|
||||
" bool is_gated) -> ()");
|
||||
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
|
||||
|
||||
// compute per-expert problem sizes from expert_first_token_offset
|
||||
@@ -564,10 +561,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
|
||||
// Compute NVFP4 block quantized tensor.
|
||||
ops.def(
|
||||
"scaled_fp4_quant(Tensor! output, Tensor input,"
|
||||
" Tensor! output_scale, Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout) -> ()");
|
||||
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
|
||||
"scaled_fp4_quant(Tensor input,"
|
||||
" Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout) -> (Tensor, Tensor)");
|
||||
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant_func);
|
||||
|
||||
// Out variant
|
||||
// TODO: Add {at::Tag::out_variant} tag and update all call sites
|
||||
// to use the functional variant once vLLM upgrades PyTorch.
|
||||
// See pytorch/pytorch#176117.
|
||||
ops.def(
|
||||
"scaled_fp4_quant.out(Tensor input,"
|
||||
" Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout, *, Tensor(a!) output, Tensor(b!) output_scale) "
|
||||
"-> ()");
|
||||
ops.impl("scaled_fp4_quant.out", torch::kCUDA, &scaled_fp4_quant_out);
|
||||
|
||||
// Compute NVFP4 experts quantization.
|
||||
ops.def(
|
||||
|
||||
@@ -44,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
|
||||
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev liblzma-dev pkg-config \
|
||||
&& for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
|
||||
@@ -76,19 +76,22 @@ ENV UV_LINK_MODE="copy"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,src=requirements/common.txt,target=/workspace/vllm/requirements/common.txt \
|
||||
--mount=type=bind,src=requirements/xpu.txt,target=/workspace/vllm/requirements/xpu.txt \
|
||||
--mount=type=bind,src=requirements/xpu-test.in,target=/workspace/vllm/requirements/xpu-test.in \
|
||||
uv pip install --upgrade pip && \
|
||||
uv pip install -r requirements/xpu.txt
|
||||
|
||||
# used for suffix method speculative decoding
|
||||
# build deps for proto + nanobind-based extensions to set up the build environment
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install grpcio-tools protobuf nanobind
|
||||
# arctic-inference is built from source which needs torch-xpu properly installed first
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -r requirements/xpu.txt && \
|
||||
uv pip compile /workspace/vllm/requirements/xpu-test.in \
|
||||
-o /workspace/vllm/requirements/xpu-test.txt \
|
||||
-c /workspace/vllm/requirements/xpu.txt \
|
||||
--index-strategy unsafe-best-match \
|
||||
--extra-index-url ${PIP_EXTRA_INDEX_URL} \
|
||||
--python-version ${PYTHON_VERSION} && \
|
||||
uv pip install grpcio-tools protobuf nanobind && \
|
||||
source /opt/intel/oneapi/setvars.sh --force && \
|
||||
source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force && \
|
||||
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
|
||||
uv pip install --no-build-isolation arctic-inference==0.1.1
|
||||
export CMAKE_PREFIX_PATH="$(python3 -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
|
||||
uv pip install --no-build-isolation -r /workspace/vllm/requirements/xpu-test.txt
|
||||
|
||||
|
||||
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ nav:
|
||||
- Models:
|
||||
- models/supported_models.md
|
||||
- models/generative_models.md
|
||||
- models/pooling_models.md
|
||||
- Pooling Models: models/pooling_models
|
||||
- models/extensions
|
||||
- Hardware Supported Models:
|
||||
- models/hardware_supported_models/*
|
||||
|
||||
@@ -37,7 +37,7 @@ For [generative models](../../models/generative_models.md), there are two levels
|
||||
|
||||
#### Pooling models
|
||||
|
||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
||||
For [pooling models](../../models/pooling_models/README.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
||||
|
||||
### Multi-modal processing
|
||||
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
!!! warning
|
||||
Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
|
||||
|
||||
!!! tip "Choosing a profiler"
|
||||
- Use **Nsight Systems** for low-overhead, performance-critical profiling.
|
||||
- Use **PyTorch Profiler** for medium-overhead profiling with richer debugging information (e.g., stack traces, memory, shapes). Note that enabling these features adds overhead and is not recommended for benchmarking.
|
||||
|
||||
## Profile with PyTorch Profiler
|
||||
|
||||
We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server.
|
||||
|
||||
@@ -127,8 +127,8 @@ Priority is **1 = highest** (tried first).
| 3 | `FLASH_ATTN_MLA` |
| 4 | `FLASHMLA` |
| 5 | `TRITON_MLA` |
| 6 | `FLASHMLA_SPARSE` |
| 7 | `FLASHINFER_MLA_SPARSE` |
| 6 | `FLASHINFER_MLA_SPARSE`**\*** |
| 7 | `FLASHMLA_SPARSE` |

**Ampere/Hopper (SM 8.x-9.x):**

@@ -140,6 +140,8 @@ Priority is **1 = highest** (tried first).
| 4 | `TRITON_MLA` |
| 5 | `FLASHMLA_SPARSE` |

> **\*** For sparse MLA, FP8 KV cache always prefers `FLASHINFER_MLA_SPARSE`. With BF16 KV cache, `FLASHINFER_MLA_SPARSE` is preferred for low query-head counts (<= 16), while `FLASHMLA_SPARSE` is preferred otherwise.
>
> **Note:** ROCm and CPU platforms have their own selection logic. See the platform-specific documentation for details.
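
A hedged pseudocode-style sketch of that preference rule (hypothetical helper; the actual selection logic lives in vLLM's platform code and is not shown here):

```cpp
const char* sparse_mla_backend(bool fp8_kv_cache, int num_query_heads) {
  if (fp8_kv_cache) return "FLASHINFER_MLA_SPARSE";
  return num_query_heads <= 16 ? "FLASHINFER_MLA_SPARSE" : "FLASHMLA_SPARSE";
}
```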
|
||||
|
||||
## Legend
|
||||
|
||||
@@ -51,11 +51,8 @@ For example:
|
||||
**1. Attention:**
|
||||
|
||||
```python
|
||||
--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
|
||||
|
||||
--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
|
||||
|
||||
--8<-- "vllm/model_executor/models/deepencoder.py:rel_pos_attention"
|
||||
```
|
||||
|
||||
**2. Activation:**
|
||||
@@ -170,6 +167,16 @@ For example:
|
||||
--8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
|
||||
```
|
||||
|
||||
**12. Encoder:**
|
||||
|
||||
```python
|
||||
--8<-- "vllm/model_executor/models/deepencoder2.py:qwen2_decoder"
|
||||
|
||||
--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
|
||||
|
||||
--8<-- "vllm/model_executor/models/deepencoder.py:rel_pos_attention"
|
||||
```
|
||||
|
||||
## Guidelines for Implementing a New CustomOp
|
||||
|
||||
### Implement a New CustomOp in vLLM
|
||||
|
||||
@@ -88,8 +88,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
|
||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||
| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
|
||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
|
||||
| rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
|
||||
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
|
||||
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
|
||||
|
||||
@@ -103,7 +103,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
|
||||
|
||||
## Modular Kernel "families"
|
||||
|
||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts.
|
||||
|
||||
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|
||||
| ------- | ---------------------------------------------- | ----------------------------------- |
|
||||
|
||||
@@ -29,10 +29,9 @@ To compile a multimodal component such as an encoder, we follow the same mechani
|
||||
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_encoder`. This will gate the compilation behind our
|
||||
`compile_mm_encoder` configuration
|
||||
|
||||
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
|
||||
relies on caching artifacts to reduce start time, we must properly propagate the `<component_name>` information to the cache in order to avoid collisions
|
||||
with the LLM text-backbone, or other instances of the same artifact (as is the case with vision block). `is_encoder=True` is also needed for encoder
|
||||
components (see Compile Range Integration).
|
||||
2. The `@support_torch_compile` decorator should include `is_encoder=True` for encoder components. This is needed for compile range integration
|
||||
(see Compile Range Integration). The decorator automatically uses the class name as the cache directory prefix, avoiding collisions between
|
||||
independently compiled sub-modules (e.g. vision encoder components vs the text backbone).
|
||||
|
||||
### CompilationConfig
|
||||
|
||||
@@ -57,8 +56,8 @@ tradeoff
|
||||
### Compile ranges
|
||||
|
||||
The torch.compile integration will try to rely on max_batch_size to infer compilation ranges for dynamic shapes; however, for modules used in the encoder, this
|
||||
shape can be difficult to infer due to the unspecified range of shapes the encoder may see as input. Therefore, we rely on `is_encoder=True` in the `set_model_tag`
|
||||
to alert torch.compile to the fact that this range cannot be inferred, and we default to the range (1, MAX_INT).
|
||||
shape can be difficult to infer due to the unspecified range of shapes the encoder may see as input. Therefore, we rely on `is_encoder=True` in the
|
||||
`@support_torch_compile` decorator to alert torch.compile to the fact that this range cannot be inferred, and we default to the range (1, MAX_INT).
|
||||
|
||||
!!! note
|
||||
We may seek to tighten this range for better performance in the future
|
||||
|
||||
@@ -36,14 +36,14 @@ th:not(:first-child) {
|
||||
}
|
||||
</style>
|
||||
|
||||
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models/README.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
| - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
|
||||
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| [SD](speculative_decoding/README.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||
| [pooling](../models/pooling_models/README.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [❌](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
||||
@@ -66,7 +66,7 @@ th:not(:first-child) {
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [SD](speculative_decoding/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) |
|
||||
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [pooling](../models/pooling_models/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
|
||||
|
||||
@@ -389,3 +389,17 @@ vllm serve model --enable-lora --max-lora-rank 64
|
||||
# Bad: unnecessarily high, wastes memory
|
||||
vllm serve model --enable-lora --max-lora-rank 256
|
||||
```
|
||||
|
||||
### Restricting LoRA to Specific Modules
|
||||
|
||||
The `--lora-target-modules` parameter allows you to restrict which model modules have LoRA applied at deployment time. This is useful for performance tuning when you only need LoRA on specific layers:
|
||||
|
||||
```bash
|
||||
# Apply LoRA only to output projection layers
|
||||
vllm serve model --enable-lora --lora-target-modules o_proj
|
||||
|
||||
# Apply LoRA to multiple specific modules
|
||||
vllm serve model --enable-lora --lora-target-modules o_proj qkv_proj down_proj
|
||||
```
|
||||
|
||||
When `--lora-target-modules` is not specified, LoRA will be applied to all supported modules in the model. This parameter accepts module suffixes (the last component of the module name), such as `o_proj`, `qkv_proj`, `gate_proj`, etc.
|
||||
|
||||
@@ -5,7 +5,7 @@ vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.
|
||||
Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
|
||||
|
||||
!!! warning
|
||||
`reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning` in case `reasoning_content` is removed in future.
|
||||
`reasoning` used to be called `reasoning_content`. To migrate, directly replace `reasoning_content` with `reasoning`.
|
||||
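As a quick illustration of the new field name, a raw HTTP request can read it straight from the response JSON. This is a hedged sketch: it assumes a reasoning-capable model is already being served with a reasoning parser enabled (e.g. `--reasoning-parser deepseek_r1`), and the model name below is a placeholder.

```python
# Sketch: read the renamed `reasoning` field from a chat completion response.
# Assumes a reasoning model is served with a reasoning parser enabled;
# the model name is a placeholder.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "messages": [{"role": "user", "content": "What is 9.11 minus 9.8?"}],
    },
)
message = resp.json()["choices"][0]["message"]
print("reasoning:", message.get("reasoning"))  # previously `reasoning_content`
print("content:", message.get("content"))
```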
|
||||
## Supported Models
|
||||
|
||||
|
||||
@@ -107,6 +107,27 @@ vLLM supports the `tool_choice='none'` option in the chat completion API. When t
|
||||
!!! note
|
||||
When tools are specified in the request, vLLM includes tool definitions in the prompt by default, regardless of the `tool_choice` setting. To exclude tool definitions when `tool_choice='none'`, use the `--exclude-tools-when-tool-choice-none` option.
|
||||
|
||||
## Constrained Decoding Behavior
|
||||
|
||||
Whether vLLM enforces the tool parameter schema during generation depends on the `tool_choice` mode:
|
||||
|
||||
| `tool_choice` value | Schema-constrained decoding | Behavior |
|
||||
| --- | --- | --- |
|
||||
| Named function | Yes (via structured outputs backend) | Arguments are guaranteed to be valid JSON conforming to the function's parameter schema. |
|
||||
| `"required"` | Yes (via structured outputs backend) | Same as named function. The model must produce at least one tool call. |
|
||||
| `"auto"` | No | The model generates freely. A tool-call parser extracts tool calls from the raw text. Arguments may be malformed or not match the schema. |
|
||||
| `"none"` | N/A | No tool calls are produced. |
|
||||
|
||||
When schema conformance matters, prefer `tool_choice="required"` or named function calling over `"auto"`.
|
||||
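To illustrate the difference in practice, here is a minimal OpenAI-client sketch using `tool_choice="required"`. It assumes a tool-calling-capable model is already being served; the model name is a placeholder.

```python
# Minimal sketch: schema-constrained tool calling with tool_choice="required".
# Assumes a tool-calling-capable model is served at localhost:8000;
# the model name below is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

resp = client.chat.completions.create(
    model="MODEL_NAME",  # placeholder
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="required",  # arguments are constrained to the JSON schema
)
print(resp.choices[0].message.tool_calls)
```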
|
||||
### Strict Mode (`strict` parameter)
|
||||
|
||||
The [OpenAI API](https://platform.openai.com/docs/guides/function-calling#strict-mode) supports a `strict` field on function definitions. When set to `true`, OpenAI uses constrained decoding to guarantee that tool-call arguments match the function schema, even in `tool_choice="auto"` mode.
|
||||
|
||||
vLLM **does not implement** `strict` mode today. The `strict` field is accepted in requests (to avoid breaking clients that set it), but it has no effect on decoding behavior. In auto mode, argument validity depends entirely on the model's output quality and the parser's extraction logic.
|
||||
|
||||
Tracking issues: [#15526](https://github.com/vllm-project/vllm/issues/15526), [#16313](https://github.com/vllm-project/vllm/issues/16313).
|
||||
|
||||
## Automatic Function Calling
|
||||
|
||||
To enable this feature, you should set the following flags:
|
||||
@@ -124,6 +145,9 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso
|
||||
|
||||
If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!
|
||||
|
||||
!!! note
|
||||
With `tool_choice="auto"`, tool-call arguments are extracted from the model's raw text output by the selected parser. No schema-level constraint is applied during decoding, so arguments may occasionally be malformed or violate the function's parameter schema. See [Constrained Decoding Behavior](#constrained-decoding-behavior) for details.
|
||||
|
||||
### Hermes Models (`hermes`)
|
||||
|
||||
All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported.
|
||||
|
||||
@@ -23,15 +23,18 @@ def title(text: str) -> str:
|
||||
# Custom substitutions
|
||||
subs = {
|
||||
"io": "IO",
|
||||
"api": "API",
|
||||
"rl": "RL",
|
||||
"api(s?)": r"API\1",
|
||||
"cli": "CLI",
|
||||
"cpu": "CPU",
|
||||
"ipc": "IPC",
|
||||
"llm": "LLM",
|
||||
"mae": "MAE",
|
||||
"ner": "NER",
|
||||
"tpu": "TPU",
|
||||
"gguf": "GGUF",
|
||||
"lora": "LoRA",
|
||||
"nccl": "NCCL",
|
||||
"rlhf": "RLHF",
|
||||
"vllm": "vLLM",
|
||||
"openai": "OpenAI",
|
||||
@@ -196,6 +199,11 @@ class Example:
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
# Monkey-patch dirname_to_title in awesome-nav so that sub-directory names are
|
||||
# title-cased (e.g. "Offline Inference" instead of "Offline inference").
|
||||
import mkdocs_awesome_nav.nav.directory as _nav_dir
|
||||
|
||||
_nav_dir.dirname_to_title = title
|
||||
logger.info("Generating example documentation")
|
||||
logger.debug("Root directory: %s", ROOT_DIR.resolve())
|
||||
logger.debug("Example directory: %s", EXAMPLE_DIR.resolve())
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
MkDocs hook to enable the following links to render correctly:
|
||||
MkDocs hook + markdown extension to enable the following links to render correctly,
|
||||
including inside content included via pymdownx.snippets:
|
||||
|
||||
- Relative file links outside of the `docs/` directory, e.g.:
|
||||
- [Text](../some_file.py)
|
||||
@@ -12,13 +13,17 @@ MkDocs hook to enable the following links to render correctly:
|
||||
e.g. <...pull/123> -> [Pull Request #123](.../pull/123)
|
||||
- Works for external repos too by including the `owner/repo` in the link title
|
||||
|
||||
The goal is to simplify cross-referencing common GitHub resources
|
||||
in project docs.
|
||||
The link replacement runs as a markdown preprocessor (priority 25) so that it executes
|
||||
after pymdownx.snippets (priority 32) has expanded all included content.
|
||||
The on_page_markdown hook passes the current page context to the preprocessor before
|
||||
each page is converted.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import regex as re
|
||||
from markdown import Extension
|
||||
from markdown.preprocessors import Preprocessor
|
||||
from mkdocs.config.defaults import MkDocsConfig
|
||||
from mkdocs.structure.files import Files
|
||||
from mkdocs.structure.pages import Page
|
||||
@@ -26,7 +31,6 @@ from mkdocs.structure.pages import Page
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
|
||||
DOC_DIR = ROOT_DIR / "docs"
|
||||
|
||||
|
||||
gh_icon = ":octicons-mark-github-16:"
|
||||
|
||||
# Regex pieces
|
||||
@@ -48,46 +52,90 @@ github_link = re.compile(rf"(\[{TITLE}\]\(|<){URL}(\)|>)")
|
||||
relative_link = re.compile(rf"\[{TITLE}\]\({RELATIVE}\)")
|
||||
|
||||
|
||||
class UrlSchemesPreprocessor(Preprocessor):
|
||||
"""Preprocessor that runs after pymdownx.snippets to process all links."""
|
||||
|
||||
def __init__(self, md, ext):
|
||||
super().__init__(md)
|
||||
self.ext = ext
|
||||
|
||||
def run(self, lines):
|
||||
page = self.ext.page
|
||||
if page is None or getattr(page.file, "abs_src_path", None) is None:
|
||||
return lines
|
||||
|
||||
def replace_relative_link(match: re.Match) -> str:
|
||||
"""
|
||||
Replace relative file links with URLs if they point outside the docs dir.
|
||||
"""
|
||||
title = match.group("title")
|
||||
path = match.group("path")
|
||||
path = (Path(page.file.abs_src_path).parent / path).resolve()
|
||||
fragment = match.group("fragment") or ""
|
||||
|
||||
# Check if the path exists and is outside the docs dir
|
||||
if not path.exists() or path.is_relative_to(DOC_DIR):
|
||||
return match.group(0)
|
||||
|
||||
# Files and directories have different URL schemes on GitHub
|
||||
slug = "tree/main" if path.is_dir() else "blob/main"
|
||||
|
||||
path = path.relative_to(ROOT_DIR)
|
||||
url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}"
|
||||
return f"[{gh_icon} {title}]({url})"
|
||||
|
||||
def replace_github_link(match: re.Match) -> str:
|
||||
"""
|
||||
Replace GitHub issue, PR, and project links with enhanced Markdown links.
|
||||
"""
|
||||
repo = match.group("repo")
|
||||
type = match.group("type")
|
||||
number = match.group("number")
|
||||
# Title and fragment could be None
|
||||
title = match.group("title") or ""
|
||||
fragment = match.group("fragment") or ""
|
||||
|
||||
# Use default titles for raw links
|
||||
if not title:
|
||||
title = TITLES[type]
|
||||
if "vllm-project" not in repo:
|
||||
title += repo
|
||||
title += f"#{number}"
|
||||
|
||||
url = f"https://github.com/{repo}/{type}/{number}{fragment}"
|
||||
return f"[{gh_icon} {title}]({url})"
|
||||
|
||||
markdown = "\n".join(lines)
|
||||
markdown = relative_link.sub(replace_relative_link, markdown)
|
||||
markdown = github_link.sub(replace_github_link, markdown)
|
||||
return markdown.split("\n")
|
||||
|
||||
|
||||
class UrlSchemesExtension(Extension):
|
||||
"""Markdown extension that registers the URL schemes preprocessor."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.page = None
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def extendMarkdown(self, md):
|
||||
# Priority 25 runs after pymdownx.snippets (priority 32)
|
||||
md.preprocessors.register(UrlSchemesPreprocessor(md, self), "url_schemes", 25)
|
||||
|
||||
|
||||
# Singleton extension instance shared between the hook and the preprocessor.
|
||||
_ext = UrlSchemesExtension()
|
||||
|
||||
|
||||
def on_config(config: MkDocsConfig) -> MkDocsConfig:
|
||||
"""Register the URL schemes markdown extension."""
|
||||
config["markdown_extensions"].append(_ext)
|
||||
return config
|
||||
|
||||
|
||||
def on_page_markdown(
|
||||
markdown: str, *, page: Page, config: MkDocsConfig, files: Files
|
||||
) -> str:
|
||||
def replace_relative_link(match: re.Match) -> str:
|
||||
"""Replace relative file links with URLs if they point outside the docs dir."""
|
||||
title = match.group("title")
|
||||
path = match.group("path")
|
||||
path = (Path(page.file.abs_src_path).parent / path).resolve()
|
||||
fragment = match.group("fragment") or ""
|
||||
|
||||
# Check if the path exists and is outside the docs dir
|
||||
if not path.exists() or path.is_relative_to(DOC_DIR):
|
||||
return match.group(0)
|
||||
|
||||
# Files and directories have different URL schemes on GitHub
|
||||
slug = "tree/main" if path.is_dir() else "blob/main"
|
||||
|
||||
path = path.relative_to(ROOT_DIR)
|
||||
url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}"
|
||||
return f"[{gh_icon} {title}]({url})"
|
||||
|
||||
def replace_github_link(match: re.Match) -> str:
|
||||
"""Replace GitHub issue, PR, and project links with enhanced Markdown links."""
|
||||
repo = match.group("repo")
|
||||
type = match.group("type")
|
||||
number = match.group("number")
|
||||
# Title and fragment could be None
|
||||
title = match.group("title") or ""
|
||||
fragment = match.group("fragment") or ""
|
||||
|
||||
# Use default titles for raw links
|
||||
if not title:
|
||||
title = TITLES[type]
|
||||
if "vllm-project" not in repo:
|
||||
title += repo
|
||||
title += f"#{number}"
|
||||
|
||||
url = f"https://github.com/{repo}/{type}/{number}{fragment}"
|
||||
return f"[{gh_icon} {title}]({url})"
|
||||
|
||||
markdown = relative_link.sub(replace_relative_link, markdown)
|
||||
markdown = github_link.sub(replace_github_link, markdown)
|
||||
"""Pass the current page context to the preprocessor."""
|
||||
_ext.page = page
|
||||
return markdown
|
||||
|
||||
@@ -1,676 +0,0 @@
|
||||
# Pooling Models
|
||||
|
||||
vLLM also supports pooling models, such as embedding, classification, and reward models.
|
||||
|
||||
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
|
||||
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
|
||||
before returning them.
|
||||
|
||||
!!! note
|
||||
We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
|
||||
|
||||
We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
|
||||
|
||||
## Configuration
|
||||
|
||||
### Model Runner
|
||||
|
||||
Run a model in pooling mode via the option `--runner pooling`.
|
||||
|
||||
!!! tip
|
||||
There is no need to set this option in the vast majority of cases as vLLM can automatically
|
||||
detect the appropriate model runner via `--runner auto`.
|
||||
|
||||
### Model Conversion
|
||||
|
||||
vLLM can adapt models for various pooling tasks via the option `--convert <type>`.
|
||||
|
||||
If `--runner pooling` has been set (manually or automatically) but the model does not implement the
|
||||
[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface,
|
||||
vLLM will attempt to automatically convert the model according to the architecture names
|
||||
shown in the table below.
|
||||
|
||||
| Architecture | `--convert` | Supported pooling tasks |
|
||||
| ----------------------------------------------- | ----------- | ------------------------------------- |
|
||||
| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` |
|
||||
| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` |
|
||||
| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` |
|
||||
|
||||
!!! tip
|
||||
You can explicitly set `--convert <type>` to specify how to convert the model.
|
||||
|
||||
### Pooling Tasks
|
||||
|
||||
Each pooling model in vLLM supports one or more of these tasks according to
|
||||
[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks],
|
||||
enabling the corresponding APIs:
|
||||
|
||||
| Task | APIs |
|
||||
| ---------------- | ----------------------------------------------------------------------------- |
|
||||
| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` |
|
||||
| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` |
|
||||
| `score` | `LLM.score(...)` |
|
||||
| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` |
|
||||
| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` |
|
||||
| `plugin` | `LLM.encode(..., pooling_task="plugin")` |
|
||||
|
||||
\* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task.
|
||||
|
||||
### Pooler Configuration
|
||||
|
||||
#### Predefined models
|
||||
|
||||
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
|
||||
you can override some of its attributes via the `--pooler-config` option.
|
||||
|
||||
#### Converted models
|
||||
|
||||
If the model has been converted via `--convert` (see above),
|
||||
the pooler assigned to each task has the following attributes by default:
|
||||
|
||||
| Task | Pooling Type | Normalization | Softmax |
|
||||
| ---------- | ------------ | ------------- | ------- |
|
||||
| `embed` | `LAST` | ✅︎ | ❌ |
|
||||
| `classify` | `LAST` | ❌ | ✅︎ |
|
||||
|
||||
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
|
||||
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
|
||||
|
||||
You can further customize this via the `--pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers' defaults.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
The [LLM][vllm.LLM] class provides various methods for offline inference.
|
||||
See [configuration](../api/README.md#configuration) for a list of options when initializing the model.
|
||||
|
||||
### `LLM.embed`
|
||||
|
||||
The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
|
||||
It is primarily designed for embedding models.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="intfloat/e5-small", runner="pooling")
|
||||
(output,) = llm.embed("Hello, my name is")
|
||||
|
||||
embeds = output.outputs.embedding
|
||||
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
|
||||
|
||||
### `LLM.classify`
|
||||
|
||||
The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
|
||||
It is primarily designed for classification models.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
|
||||
(output,) = llm.classify("Hello, my name is")
|
||||
|
||||
probs = output.outputs.probs
|
||||
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
|
||||
|
||||
### `LLM.score`
|
||||
|
||||
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
|
||||
It is designed for embedding models and cross-encoder models. Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems.
|
||||
|
||||
!!! note
|
||||
vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
|
||||
To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
|
||||
(output,) = llm.score(
|
||||
"What is the capital of France?",
|
||||
"The capital of Brazil is Brasilia.",
|
||||
)
|
||||
|
||||
score = output.outputs.score
|
||||
print(f"Score: {score}")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
|
||||
|
||||
### `LLM.reward`
|
||||
|
||||
The [reward][vllm.LLM.reward] method is available to all reward models in vLLM.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
|
||||
(output,) = llm.reward("Hello, my name is")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
!!! note
|
||||
Please use one of the more specific methods or set the task directly when using `LLM.encode`:
|
||||
|
||||
- For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
|
||||
- For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
|
||||
- For similarity scores, use `LLM.score(...)`.
|
||||
- For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
|
||||
- For token classification, use `pooling_task="token_classify"`.
|
||||
- For multi-vector retrieval, use `pooling_task="token_embed"`.
|
||||
- For IO Processor Plugins, use `pooling_task="plugin"`.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="intfloat/e5-small", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="embed")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
|
||||
- [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
|
||||
- [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
|
||||
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
|
||||
!!! note
|
||||
Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
|
||||
|
||||
- For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
|
||||
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
|
||||
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
|
||||
- For rewards, use `"task":"token_classify"`.
|
||||
- For token classification, use `"task":"token_classify"`.
|
||||
- For multi-vector retrieval, use `"task":"token_embed"`.
|
||||
- For IO Processor Plugins, use `"task":"plugin"`.
|
||||
|
||||
```python
|
||||
# start a supported embeddings model server with `vllm serve`, e.g.
|
||||
# vllm serve intfloat/e5-small
|
||||
import requests
|
||||
|
||||
host = "localhost"
|
||||
port = "8000"
|
||||
model_name = "intfloat/e5-small"
|
||||
|
||||
api_url = f"http://{host}:{port}/pooling"
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
prompt = {"model": model_name, "input": prompts, "task": "embed"}
|
||||
|
||||
response = requests.post(api_url, json=prompt)
|
||||
|
||||
for output in response.json()["data"]:
|
||||
data = output["data"]
|
||||
print(f"Data: {data!r} (size={len(data)})")
|
||||
```
|
||||
|
||||
## Matryoshka Embeddings
|
||||
|
||||
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
|
||||
|
||||
!!! warning
|
||||
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
|
||||
|
||||
For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
|
||||
|
||||
```json
|
||||
{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
|
||||
```
|
||||
|
||||
### Manually enable Matryoshka Embeddings
|
||||
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
|
||||
|
||||
For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
|
||||
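For offline inference the same overrides go to the `LLM` constructor. A minimal sketch, reusing the model and dimension from the serving example below:

```python
# Sketch: enable Matryoshka dimensions offline via hf_overrides.
# Model name and dimension mirror the serving example below.
from vllm import LLM, PoolingParams

llm = LLM(
    model="Snowflake/snowflake-arctic-embed-m-v1.5",
    runner="pooling",
    hf_overrides={"matryoshka_dimensions": [256]},
)
(output,) = llm.embed(
    "Follow the white rabbit.",
    pooling_params=PoolingParams(dimensions=256),
)
print(len(output.outputs.embedding))  # 256
```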
|
||||
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||
|
||||
```bash
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
|
||||
```
|
||||
|
||||
### Offline Inference
|
||||
|
||||
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams].
|
||||
|
||||
```python
|
||||
from vllm import LLM, PoolingParams
|
||||
|
||||
llm = LLM(
|
||||
model="jinaai/jina-embeddings-v3",
|
||||
runner="pooling",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
outputs = llm.embed(
|
||||
["Follow the white rabbit."],
|
||||
pooling_params=PoolingParams(dimensions=32),
|
||||
)
|
||||
print(outputs[0].outputs)
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy_offline.py](../../examples/pooling/embed/embed_matryoshka_fy_offline.py)
|
||||
|
||||
### Online Inference
|
||||
|
||||
Use the following command to start the vLLM server.
|
||||
|
||||
```bash
|
||||
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
||||
```
|
||||
|
||||
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:8000/v1/embeddings \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"input": "Follow the white rabbit.",
|
||||
"model": "jinaai/jina-embeddings-v3",
|
||||
"encoding_format": "float",
|
||||
"dimensions": 32
|
||||
}'
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```json
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy_client.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy_client.py)
|
||||
|
||||
## Specific models
|
||||
|
||||
### ColBERT Late Interaction Models
|
||||
|
||||
[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders.
|
||||
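For intuition, here is a rough sketch of MaxSim scoring outside vLLM: each query token embedding is matched against its most similar document token embedding, and the per-query-token maxima are summed.

```python
# Illustrative MaxSim (late interaction) scoring; not part of vLLM.
import numpy as np


def maxsim(query_embeds: np.ndarray, doc_embeds: np.ndarray) -> float:
    """query_embeds: (num_query_tokens, dim); doc_embeds: (num_doc_tokens, dim).
    Assumes rows are L2-normalized, so dot products are cosine similarities."""
    sim = query_embeds @ doc_embeds.T    # (num_query_tokens, num_doc_tokens)
    return float(sim.max(axis=1).sum())  # best doc token per query token, summed


rng = np.random.default_rng(0)
q = rng.normal(size=(5, 128))
q /= np.linalg.norm(q, axis=1, keepdims=True)
d = rng.normal(size=(40, 128))
d /= np.linalg.norm(d, axis=1, keepdims=True)
print(maxsim(q, d))
```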
|
||||
vLLM supports ColBERT models with multiple encoder backbones:
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
|
||||
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
|
||||
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
|
||||
|
||||
**BERT-based ColBERT** models work out of the box:
|
||||
|
||||
```shell
|
||||
vllm serve answerdotai/answerai-colbert-small-v1
|
||||
```
|
||||
|
||||
For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture:
|
||||
|
||||
```shell
|
||||
# ModernBERT backbone
|
||||
vllm serve lightonai/GTE-ModernColBERT-v1 \
|
||||
--hf-overrides '{"architectures": ["ColBERTModernBertModel"]}'
|
||||
|
||||
# Jina XLM-RoBERTa backbone
|
||||
vllm serve jinaai/jina-colbert-v2 \
|
||||
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
|
||||
--trust-remote-code
|
||||
```
|
||||
|
||||
Then you can use the rerank endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the score endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"text_1": "What is machine learning?",
|
||||
"text_2": ["Machine learning is a subset of AI.", "The weather is sunny."]
|
||||
}'
|
||||
```
|
||||
|
||||
You can also get the raw token embeddings using the pooling endpoint with `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
|
||||
|
||||
### ColQwen3 Multi-Modal Late Interaction Models
|
||||
|
||||
ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
|
||||
| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
|
||||
| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
|
||||
```
|
||||
|
||||
#### Text-only scoring and reranking
|
||||
|
||||
Use the `/rerank` endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the `/score` endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"text_1": "What is the capital of France?",
|
||||
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
|
||||
}'
|
||||
```
|
||||
|
||||
#### Multi-modal scoring and reranking (text query × image documents)
|
||||
|
||||
The `/score` and `/rerank` endpoints also accept multi-modal inputs directly.
|
||||
Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields
|
||||
with a `content` list containing `image_url` and `text` parts — the same format used by the
|
||||
OpenAI chat completion API:
|
||||
|
||||
Score a text query against image documents:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"data_1": "Retrieve the city of Beijing",
|
||||
"data_2": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Rerank image documents by a text query:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"query": "Retrieve the city of Beijing",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
],
|
||||
"top_n": 2
|
||||
}'
|
||||
```
|
||||
|
||||
#### Raw token embeddings
|
||||
|
||||
You can also get the raw token embeddings using the `/pooling` endpoint with `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
For **image inputs** via the pooling endpoint, use the chat-style `messages` field:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
|
||||
- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
|
||||
|
||||
### Llama Nemotron Multimodal
|
||||
|
||||
#### Embedding Model
|
||||
|
||||
Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
|
||||
(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
|
||||
single-vector embeddings from text and/or images.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
|
||||
--trust-remote-code \
|
||||
--chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
|
||||
```
|
||||
|
||||
!!! note
|
||||
The chat template bundled with this model's tokenizer is not suitable for
|
||||
the embeddings API. Use the provided override template above when serving
|
||||
with the `messages`-based (chat-style) embeddings endpoint.
|
||||
|
||||
The override template uses the message `role` to automatically prepend the
|
||||
appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
|
||||
or `"document"` for passages (prepends `passage: `). Any other role omits
|
||||
the prefix.
|
||||
|
||||
Embed text queries:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "query",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is machine learning?"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Embed images via the chat-style `messages` field:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "document",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
#### Reranker Model
|
||||
|
||||
Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP
|
||||
backbone with a sequence-classification head for cross-encoder scoring and reranking.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja
|
||||
```
|
||||
|
||||
!!! note
|
||||
The chat template bundled with this checkpoint's tokenizer is not suitable
|
||||
for the Score/Rerank APIs. Use the provided override template when serving:
|
||||
`examples/pooling/score/template/nemotron-vl-rerank.jinja`.
|
||||
|
||||
Score a text query against an image document:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
|
||||
"data_1": "Find diagrams about autonomous robots",
|
||||
"data_2": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Robotics workflow diagram."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Rerank image documents by a text query:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
|
||||
"query": "Find diagrams about autonomous robots",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
|
||||
{"type": "text", "text": "Robotics workflow diagram."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
|
||||
{"type": "text", "text": "General skyline photo."}
|
||||
]
|
||||
}
|
||||
],
|
||||
"top_n": 2
|
||||
}'
|
||||
```
|
||||
|
||||
### BAAI/bge-m3
|
||||
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
|
||||
the architecture is declared as `XLMRobertaModel`, which makes vLLM load it as a vanilla RoBERTa model without the
|
||||
extra weights. To load the full model weights, override its architecture like this:
|
||||
|
||||
```shell
|
||||
vllm serve BAAI/bge-m3 --hf-overrides '{"architectures": ["BgeM3EmbeddingModel"]}'
|
||||
```
|
||||
|
||||
Then you obtain the sparse embeddings like this:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "BAAI/bge-m3",
|
||||
"task": "token_classify",
|
||||
"input": ["What is BGE M3?", "Definition of BM25"]
|
||||
}'
|
||||
```
|
||||
|
||||
Due to limitations in the output schema, the output consists of a list of
|
||||
per-token scores for each input. This means that you'll have to call
|
||||
`/tokenize` as well to be able to pair tokens with scores.
|
||||
Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how
|
||||
to do that.
|
||||
|
||||
You can obtain the colbert embeddings like this:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "BAAI/bge-m3",
|
||||
"task": "token_embed",
|
||||
"input": ["What is BGE M3?", "Definition of BM25"]
|
||||
}'
|
||||
```
|
||||
|
||||
## Deprecated Features
|
||||
|
||||
### Encode task
|
||||
|
||||
We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
|
||||
|
||||
- `token_embed` is the same as `embed`, using normalization as the activation.
|
||||
- `token_classify` is the same as `classify`, by default using softmax as the activation.
|
||||
|
||||
Pooling models now support all pooling tasks by default; you can use them without any extra settings.
|
||||
|
||||
- To extract hidden states, prefer the `token_embed` task.
|
||||
- Reward models should prefer the `token_classify` task.
|
||||
260
docs/models/pooling_models/README.md
Normal file
@@ -0,0 +1,260 @@
|
||||
# Pooling Models
|
||||
|
||||
!!! note
|
||||
We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
|
||||
|
||||
We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
|
||||
|
||||
## What are pooling models?
|
||||
|
||||
Natural Language Processing (NLP) can be primarily divided into the following two types of tasks:
|
||||
|
||||
- Natural Language Understanding (NLU)
|
||||
- Natural Language Generation (NLG)
|
||||
|
||||
The generative models supported by vLLM cover a variety of task types, such as the large language models (LLMs) we are familiar with, multimodal models (VLM) that handle multimodal inputs like images, videos, and audio, speech-to-text transcription models, and real-time models that support streaming input. Their common feature is the ability to generate text. Taking it a step further, vLLM-Omni supports the generation of multimodal content, including images, videos, and audio.
|
||||
|
||||
As the capabilities of generative models continue to improve, the boundaries of these models are also constantly expanding. However, certain application scenarios still require specialized small language models to efficiently complete specific tasks. These models typically have the following characteristics:
|
||||
|
||||
- They do not require content generation.
|
||||
- They only need to perform very limited functions, without requiring strong generalization, creativity, or high intelligence.
|
||||
- They demand extremely low latency and may operate on cost-constrained hardware.
|
||||
- Text-only models typically have fewer than 1 billion parameters, while multimodal models generally have fewer than 10 billion parameters.
|
||||
|
||||
Although these models are relatively small in scale, they are still based on the Transformer architecture, similar or even identical to the most advanced large language models today. Many recently released pooling models are also fine-tuned from large language models, allowing them to benefit from the continuous improvements in large models. This architecture similarity enables them to reuse much of vLLM’s infrastructure. If compatible, we would be happy to help them leverage the latest features of vLLM as well.
|
||||
|
||||
### Sequence-wise Task and Token-wise Task
|
||||
|
||||
The key distinction between sequence-wise and token-wise tasks lies in their output granularity: a sequence-wise task produces a single result for an entire input sequence, whereas a token-wise task yields a result for each individual token within the sequence.
|
||||
|
||||
Of course, we also have "plugin" tasks that allow users to customize input and output processors. For more information, please refer to [IO Processor Plugins](../../design/io_processor_plugins.md).
|
||||
|
||||
### Pooling Tasks
|
||||
|
||||
| Pooling Tasks | Granularity | Outputs |
|
||||
|-----------------------|---------------|-------------------------------------------------|
|
||||
| `classify` (see note) | Sequence-wise | probability vector of classes for each sequence |
|
||||
| `embed` | Sequence-wise | vector representations for each sequence |
|
||||
| `token_classify` | Token-wise | probability vector of classes for each token |
|
||||
| `token_embed` | Token-wise | vector representations for each token |
|
||||
|
||||
!!! note
|
||||
Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
|
||||
|
||||
### Score Types
|
||||
|
||||
Scoring models are designed to compute similarity scores between two input prompts. They support three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`.
|
||||
|
||||
| Pooling Tasks | Granularity | Outputs | Score Types | scoring function |
|
||||
|-----------------------|---------------|----------------------------------------------|--------------------|--------------------------|
|
||||
| `classify` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier |
|
||||
| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity |
|
||||
| `token_classify`      | Token-wise    | probability vector of classes for each token | N/A                | N/A                      |
|
||||
| `token_embed`         | Token-wise    | vector representations for each token        | `late-interaction` | late interaction (MaxSim) |
|
||||
|
||||
!!! note
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
|
||||
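For intuition, the `bi-encoder` scoring function can be reproduced by hand: embed both texts and take the cosine similarity of the two vectors. A small sketch using an embedding model that appears elsewhere in these docs:

```python
# Sketch: reproduce bi-encoder scoring manually with cosine similarity.
import numpy as np

from vllm import LLM

llm = LLM(model="intfloat/e5-small", runner="pooling")
query, document = llm.embed(
    ["What is the capital of France?", "The capital of France is Paris."]
)

q = np.array(query.outputs.embedding)
d = np.array(document.outputs.embedding)
score = float(q @ d / (np.linalg.norm(q) * np.linalg.norm(d)))
print(f"cosine similarity: {score:.4f}")
```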
|
||||
### Pooling Usages
|
||||
|
||||
| Pooling Usages | Description |
|
||||
|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| Classification Usages | Predicting which predefined category, class, or label best corresponds to a given input. |
|
||||
| Embedding Usages | Converts unstructured data (text, images, audio, etc.) into structured numerical vectors (embeddings). |
|
||||
| Token Classification Usages | Token-wise classification |
|
||||
| Token Embedding Usages | Token-wise embedding |
|
||||
| Scoring Usages | Computes similarity scores between two inputs. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. |
|
||||
| Reward Usages | Evaluates the quality of outputs generated by a language model, acting as a proxy for human preferences. |
|
||||
|
||||
We also have some special models that support multiple pooling tasks, or have specific usage scenarios, or support special inputs and outputs.
|
||||
|
||||
For more detailed information, please refer to the link below.
|
||||
|
||||
- [Classification Usages](classify.md)
|
||||
- [Embedding Usages](embed.md)
|
||||
- [Reward Usages](reward.md)
|
||||
- [Token Classification Usages](token_classify.md)
|
||||
- [Token Embedding Usages](token_embed.md)
|
||||
- [Scoring Usages](scoring.md)
|
||||
- [Specific Model Examples](specific_models.md)
|
||||
|
||||
## Offline Inference
|
||||
|
||||
Each pooling model in vLLM supports one or more of these tasks according to
|
||||
[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks],
|
||||
enabling the corresponding APIs.
|
||||
|
||||
### Offline APIs corresponding to pooling tasks
|
||||
|
||||
| Task | APIs |
|
||||
|------------------|---------------------------------------------------------------------------------------|
|
||||
| `embed` | `LLM.embed(...)`, `LLM.encode(..., pooling_task="embed")`, `LLM.score(...)`(see note) |
|
||||
| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`, `LLM.score(...)` |
|
||||
| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` |
|
||||
| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` |
|
||||
| `plugin` | `LLM.encode(..., pooling_task="plugin")` |
|
||||
|
||||
!!! note
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
|
||||
|
||||
### `LLM.classify`
|
||||
|
||||
The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
|
||||
It is primarily designed for [classification models](classify.md).
|
||||
For more information about `LLM.classify`, see [this page](classify.md#offline-inference).
|
||||
|
||||
### `LLM.embed`
|
||||
|
||||
The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
|
||||
It is primarily designed for [embedding models](embed.md).
|
||||
For more information about `LLM.embed`, see [this page](embed.md#offline-inference).
|
||||
|
||||
### `LLM.score`
|
||||
|
||||
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
|
||||
It is primarily designed for [scoring models](scoring.md).
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
Please use one of the more specific methods or set the task directly when using `LLM.encode`; refer to the [table above](#offline-apis-corresponding-to-pooling-tasks).
|
||||
|
||||
### Examples
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="intfloat/e5-small", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="embed")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
Our online server provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- Corresponding to `LLM.embed`:
|
||||
- [Cohere Embed API](embed.md#cohere-embed-api) (`/v2/embed`)
|
||||
- [OpenAI-compatible Embeddings API](embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
|
||||
- Corresponding to `LLM.classify`:
|
||||
- [Classification API](classify.md#online-serving) (`/classify`)
|
||||
- Corresponding to `LLM.score`:
|
||||
- [Score API](scoring.md#score-api) (`/score`)
|
||||
- [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
- Pooling API (`/pooling`) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
|
||||
The following introduces the Pooling API. For other APIs, please refer to the link above.
|
||||
|
||||
### Pooling API
|
||||
|
||||
Our Pooling API (`/pooling`) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
|
||||
The input format is the same as [Embeddings API](embed.md#openai-compatible-embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
|
||||
|
||||
Please use one of the more specific APIs or set the task directly when using the Pooling API; refer to the [table above](#offline-apis-corresponding-to-pooling-tasks).
|
||||
|
||||
Code example: [examples/pooling/pooling/pooling_online.py](../../../examples/pooling/pooling/pooling_online.py)
|
||||
|
||||
### Examples
|
||||
|
||||
```python
|
||||
# start a supported embeddings model server with `vllm serve`, e.g.
|
||||
# vllm serve intfloat/e5-small
|
||||
import requests
|
||||
|
||||
host = "localhost"
|
||||
port = "8000"
|
||||
model_name = "intfloat/e5-small"
|
||||
|
||||
api_url = f"http://{host}:{port}/pooling"
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
prompt = {"model": model_name, "input": prompts, "task": "embed"}
|
||||
|
||||
response = requests.post(api_url, json=prompt)
|
||||
|
||||
for output in response.json()["data"]:
|
||||
data = output["data"]
|
||||
print(f"Data: {data!r} (size={len(data)})")
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
|
||||
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
|
||||
before returning them.
|
||||
|
||||
### Model Runner
|
||||
|
||||
Run a model in pooling mode via the option `--runner pooling`.
|
||||
|
||||
!!! tip
|
||||
There is no need to set this option in the vast majority of cases as vLLM can automatically
|
||||
detect the appropriate model runner via `--runner auto`.
|
||||
|
||||
### Model Conversion
|
||||
|
||||
vLLM can adapt models for various pooling tasks via the option `--convert <type>`.
|
||||
|
||||
If `--runner pooling` has been set (manually or automatically) but the model does not implement the
|
||||
[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface,
|
||||
vLLM will attempt to automatically convert the model according to the architecture names
|
||||
shown in the table below.
|
||||
|
||||
| Architecture | `--convert` | Supported pooling tasks |
|
||||
|-------------------------------------------------|-------------|------------------------------|
|
||||
| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` |
|
||||
| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` |
|
||||
| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify` |
|
||||
|
||||
!!! tip
|
||||
You can explicitly set `--convert <type>` to specify how to convert the model.
|
||||
|
||||
### Pooler Configuration
|
||||
|
||||
#### Predefined models
|
||||
|
||||
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
|
||||
you can override some of its attributes via the `--pooler-config` option.
|
||||
|
||||
#### Converted models
|
||||
|
||||
If the model has been converted via `--convert` (see above),
|
||||
the pooler assigned to each task has the following attributes by default:
|
||||
|
||||
| Task | Pooling Type | Normalization | Softmax |
|
||||
| ---------- | ------------ | ------------- | ------- |
|
||||
| `embed` | `LAST` | ✅︎ | ❌ |
|
||||
| `classify` | `LAST` | ❌ | ✅︎ |
|
||||
|
||||
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
|
||||
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
|
||||
|
||||
You can further customize this via the `--pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers' defaults.
|
||||
|
||||
## Removed Features
|
||||
|
||||
### Encode task
|
||||
|
||||
We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
|
||||
|
||||
- `token_embed` is the token-wise counterpart of `embed`, using normalization as its activation by default.

- `token_classify` is the token-wise counterpart of `classify`, using softmax as its activation by default.
|
||||
|
||||
Pooling models now support all pooling tasks by default, so you can use them without any additional settings.
|
||||
|
||||
- To extract hidden states, prefer the `token_embed` task (see the sketch below).

- For Named Entity Recognition (NER) and reward models, prefer the `token_classify` task.
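A minimal sketch of extracting per-token hidden states via the `token_embed` task, reusing the `intfloat/e5-small` model from the example above:

```python
from vllm import LLM

llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="token_embed")

# One vector per input token.
data = output.outputs.data
print(f"Data: {data!r}")
```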
|
||||
|
||||
### Score task
|
||||
|
||||
The `score` task is deprecated and will be removed in v0.20. Please use `classify` instead. A classification model can be used as a scoring model (with its scoring API enabled) only when it outputs num_labels equal to 1.
|
||||
docs/models/pooling_models/classify.md (new file, 278 lines)
@@ -0,0 +1,278 @@
|
||||
# Classification Usages
|
||||
|
||||
Classification involves predicting which predefined category, class, or label best corresponds to a given input.
|
||||
|
||||
## Summary
|
||||
|
||||
- Model Usage: (sequence) classification
|
||||
- Pooling Task: `classify`
|
||||
- Offline APIs:
|
||||
- `LLM.classify(...)`
|
||||
- `LLM.encode(..., pooling_task="classify")`
|
||||
- Online APIs:
|
||||
- [Classification API](classify.md#online-serving) (`/classify`)
|
||||
- Pooling API (`/pooling`)
|
||||
|
||||
The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence.
|
||||
|
||||
Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md).
|
||||
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled; please refer to [this page](scoring.md).
|
||||
|
||||
## Typical Use Cases
|
||||
|
||||
### Classification
|
||||
|
||||
The most fundamental application of classification models is to categorize input data into predefined classes.
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Text-only Models
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
|
||||
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | |
|
||||
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
|
||||
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `jason9693/Qwen2.5-1.5B-apeach` | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
### Multimodal Models
|
||||
|
||||
!!! note
|
||||
For more information about multimodal model inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
|
||||
| `Qwen2_5_VLForSequenceClassification`<sup>C</sup> | Qwen2_5_VL-based | T + I<sup>E+</sup> + V<sup>E+</sup> | `muziyongshixin/Qwen2.5-VL-7B-for-VideoCls` | | |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||
|
||||
### Cross-encoder Models
|
||||
|
||||
Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md).
|
||||
|
||||
--8<-- "docs/models/pooling_models/scoring.md:supported-cross-encoder-models"
|
||||
|
||||
### Reward Models
|
||||
|
||||
(Sequence) classification models can also be used as reward models. For more information, see [Reward Models](reward.md).
|
||||
|
||||
--8<-- "docs/models/pooling_models/reward.md:supported-sequence-reward-models"
|
||||
|
||||
## Offline Inference
|
||||
|
||||
### Pooling Parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
### `LLM.classify`
|
||||
|
||||
The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
|
||||
(output,) = llm.classify("Hello, my name is")
|
||||
|
||||
probs = output.outputs.probs
|
||||
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/classify.py](../../../examples/basic/offline_inference/classify.py)
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
Set `pooling_task="classify"` when using `LLM.encode` for classification models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="classify")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
### Classification API
|
||||
|
||||
The online `/classify` API is similar to `LLM.classify`.
|
||||
|
||||
#### Completion Parameters
|
||||
|
||||
The following Classification API parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
#### Chat Parameters
|
||||
|
||||
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported instead:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
#### Example Requests
|
||||
|
||||
Code example: [examples/pooling/classify/classification_online.py](../../../examples/pooling/classify/classification_online.py)
|
||||
|
||||
You can classify multiple texts by passing an array of strings:
|
||||
|
||||
```bash
|
||||
curl -v "http://127.0.0.1:8000/classify" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"input": [
|
||||
"Loved the new café—coffee was great.",
|
||||
"This update broke everything. Frustrating."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
|
||||
"object": "list",
|
||||
"created": 1745383065,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
],
|
||||
"num_classes": 2
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"label": "Spoiled",
|
||||
"probs": [
|
||||
0.26448777318000793,
|
||||
0.7355121970176697
|
||||
],
|
||||
"num_classes": 2
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 20,
|
||||
"total_tokens": 20,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
You can also pass a string directly to the `input` field:
|
||||
|
||||
```bash
|
||||
curl -v "http://127.0.0.1:8000/classify" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"input": "Loved the new café—coffee was great."
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
|
||||
"object": "list",
|
||||
"created": 1745383213,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
],
|
||||
"num_classes": 2
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 10,
|
||||
"total_tokens": 10,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## More examples
|
||||
|
||||
More examples can be found here: [examples/pooling/classify](../../../examples/pooling/classify)
|
||||
|
||||
## Supported Features
|
||||
|
||||
### Enable/disable activation
|
||||
|
||||
You can enable or disable the activation function via the `use_activation` pooling parameter.
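A minimal sketch of disabling the activation to obtain raw logits instead of probabilities, assuming `use_activation` is passed through [PoolingParams][vllm.PoolingParams] (see the Removed Features section below):

```python
from vllm import LLM, PoolingParams

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
(output,) = llm.encode(
    "Hello, my name is",
    pooling_task="classify",
    pooling_params=PoolingParams(use_activation=False),
)

# Raw logits rather than softmaxed probabilities.
print(output.outputs.data)
```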
|
||||
|
||||
### Problem type (e.g. `multi_label_classification`)
|
||||
|
||||
You can set the problem type via `problem_type` in the Hugging Face config. The supported problem types are: `single_label_classification`, `multi_label_classification`, and `regression`.
|
||||
|
||||
This aligns with the Transformers [ForSequenceClassificationLoss](https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92) implementation.
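Since `problem_type` lives in the Hugging Face config, one way to set it for a checkpoint that does not define it is `hf_overrides` (a sketch; whether this is appropriate depends on how the classification head was trained):

```python
from vllm import LLM

# Treat the head as multi-label: an independent sigmoid is applied per label
# instead of a softmax across labels.
llm = LLM(
    model="jason9693/Qwen2.5-1.5B-apeach",
    runner="pooling",
    hf_overrides={"problem_type": "multi_label_classification"},
)
```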
|
||||
|
||||
### Logit bias
|
||||
|
||||
You can set `logit_bias` (aka `sigmoid_normalize`) via the `logit_bias` field of `vllm.config.PoolerConfig`.
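A sketch of setting it at serve time via `--pooler-config` (the value here is purely illustrative):

```bash
vllm serve jason9693/Qwen2.5-1.5B-apeach --pooler-config '{"logit_bias": 1.0}'
```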
|
||||
|
||||
## Removed Features
|
||||
|
||||
### Remove softmax from PoolingParams
|
||||
|
||||
We have removed `softmax` and `activation` from `PoolingParams`. Use `use_activation` instead, since `classify` and `token_classify` may now use any activation function.
|
||||
docs/models/pooling_models/embed.md (new file, 546 lines)
@@ -0,0 +1,546 @@
|
||||
# Embedding Usages
|
||||
|
||||
Embedding models are a class of machine learning models designed to transform unstructured data—such as text, images, or audio—into a structured numerical representation known as an embedding.
|
||||
|
||||
## Summary
|
||||
|
||||
- Model Usage: (sequence) embedding
|
||||
- Pooling Task: `embed`
|
||||
- Offline APIs:
|
||||
- `LLM.embed(...)`
|
||||
- `LLM.encode(..., pooling_task="embed")`
|
||||
- `LLM.score(...)`
|
||||
- Online APIs:
|
||||
- [Cohere Embed API](embed.md#cohere-embed-api) (`/v2/embed`)
|
||||
- [OpenAI-compatible Embeddings API](embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
|
||||
- Pooling API (`/pooling`)
|
||||
|
||||
The primary distinction between (sequence) embedding and token embedding lies in their output granularity: (sequence) embedding produces a single embedding vector for an entire input sequence, whereas token embedding generates an embedding for each individual token within the sequence.
|
||||
|
||||
Many embedding models support both (sequence) embedding and token embedding. For further details on token embedding, please refer to [this page](token_embed.md).
|
||||
|
||||
## Typical Use Cases
|
||||
|
||||
### Embedding
|
||||
|
||||
The most basic use case of embedding models is to embed the inputs, e.g. for RAG.
|
||||
|
||||
### Pairwise Similarity
|
||||
|
||||
You can compute pairwise similarity scores to build a similarity matrix using the [Score API](scoring.md).
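A minimal sketch of building such a matrix offline with `LLM.score`, scoring one sentence pair per call for simplicity (for bi-encoders this computes the cosine similarity of the two embeddings):

```python
from vllm import LLM

llm = LLM(model="intfloat/e5-small", runner="pooling")

sentences = [
    "The capital of France is Paris.",
    "Paris is the capital city of France.",
    "The future of AI is promising.",
]

# Score every pair of sentences to form a similarity matrix.
matrix = [
    [llm.score(a, b)[0].outputs.score for b in sentences]
    for a in sentences
]

for row in matrix:
    print(row)
```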
|
||||
|
||||
## Supported Models
|
||||
|
||||
--8<-- [start:supported-embed-models]
|
||||
|
||||
### Text-only Models
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
|
||||
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
|
||||
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
|
||||
| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | |
|
||||
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
|
||||
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
|
||||
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
|
||||
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
|
||||
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
|
||||
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
|
||||
| `VoyageQwen3BidirectionalEmbedModel`<sup>C</sup> | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ |
|
||||
| `XLMRobertaModel` | XLMRobertaModel-based | `BAAI/bge-m3` (see note), `intfloat/multilingual-e5-base`, `jinaai/jina-embeddings-v3` (see note), etc. | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
!!! note
|
||||
The second-generation GTE model (mGTE-TRM) is named `NewModel`. Since the name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
|
||||
!!! note
|
||||
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
|
||||
See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
|
||||
|
||||
!!! note
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings, See [this page](specific_models.md#baaibge-m3) for more information.
|
||||
|
||||
!!! note
|
||||
`jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, but vLLM currently supports only the text-matching task by merging the LoRA weights.
|
||||
|
||||
### Multimodal Models
|
||||
|
||||
!!! note
|
||||
For more information about multimodal model inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
|
||||
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
|
||||
| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
|
||||
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./README.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings
|
||||
of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
|
||||
|
||||
!!! note
|
||||
Although vLLM supports automatically converting models of any architecture into embedding models via `--convert embed`, to get the best results, you should use pooling models that are specifically trained as such.
|
||||
|
||||
--8<-- [end:supported-embed-models]
|
||||
|
||||
## Offline Inference
|
||||
|
||||
### Pooling Parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:embed-pooling-params"
|
||||
```
|
||||
|
||||
### `LLM.embed`
|
||||
|
||||
The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="intfloat/e5-small", runner="pooling")
|
||||
(output,) = llm.embed("Hello, my name is")
|
||||
|
||||
embeds = output.outputs.embedding
|
||||
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/embed.py](../../../examples/basic/offline_inference/embed.py)
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
Set `pooling_task="embed"` when using `LLM.encode` for embedding models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="intfloat/e5-small", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="embed")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
### `LLM.score`
|
||||
|
||||
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
|
||||
|
||||
All models that support the embedding task also support using the score API to compute similarity scores by calculating the cosine similarity of two input prompts' embeddings.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="intfloat/e5-small", runner="pooling")
|
||||
(output,) = llm.score(
|
||||
"What is the capital of France?",
|
||||
"The capital of Brazil is Brasilia.",
|
||||
)
|
||||
|
||||
score = output.outputs.score
|
||||
print(f"Score: {score}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
### OpenAI-Compatible Embeddings API
|
||||
|
||||
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
Code example: [examples/pooling/embed/openai_embedding_client.py](../../../examples/pooling/embed/openai_embedding_client.py)
|
||||
|
||||
#### Completion Parameters
|
||||
|
||||
The following Embeddings API parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
|
||||
```
|
||||
|
||||
#### Chat Parameters
|
||||
|
||||
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported instead:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
If the model has a [chat template](../../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](../../serving/openai_compatible_server.md#chat-api))
|
||||
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from typing import Literal, Union

from openai import OpenAI
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
|
||||
def create_chat_embeddings(
|
||||
client: OpenAI,
|
||||
*,
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
model: str,
|
||||
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
|
||||
) -> CreateEmbeddingResponse:
|
||||
return client.post(
|
||||
"/embeddings",
|
||||
cast_to=CreateEmbeddingResponse,
|
||||
body={"messages": messages, "model": model, "encoding_format": encoding_format},
|
||||
)
|
||||
```
|
||||
|
||||
##### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
|
||||
and passing a list of `messages` in the request. Refer to the examples below for illustration.
|
||||
|
||||
=== "VLM2Vec"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling`
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
|
||||
|
||||
Since the OpenAI client does not define a schema for chat-style embedding requests, we post the request using the `create_chat_embeddings` helper defined above:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="EMPTY",
|
||||
)
|
||||
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
```
|
||||
|
||||
=== "DSE-Qwen2-MRL"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 8192 \
|
||||
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
|
||||
|
||||
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
|
||||
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
|
||||
|
||||
!!! important
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
|
||||
Full example: [examples/pooling/embed/vision_embedding_online.py](../../../examples/pooling/embed/vision_embedding_online.py)
|
||||
|
||||
### Cohere Embed API
|
||||
|
||||
Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed), which adds support for modern embedding features such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models).
|
||||
|
||||
#### Cohere Embed API request parameters
|
||||
|
||||
| Parameter | Type | Required | Description |
|
||||
| --------- | ---- | -------- | ----------- |
|
||||
| `model` | string | Yes | Model name |
|
||||
| `input_type` | string | No | Prompt prefix key (model-dependent, see below) |
|
||||
| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) |
|
||||
| `images` | list[string] | No | Base64 data URI images |
|
||||
| `inputs` | list[object] | No | Mixed text and image content objects |
|
||||
| `embedding_types` | list[string] | No | Output types (default: `["float"]`) |
|
||||
| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) |
|
||||
| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) |
|
||||
|
||||
#### Text embedding
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/v2/embed" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||
"input_type": "query",
|
||||
"texts": ["Hello world", "How are you?"],
|
||||
"embedding_types": ["float"]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "embd-...",
|
||||
"embeddings": {
|
||||
"float": [
|
||||
[0.012, -0.034, ...],
|
||||
[0.056, 0.078, ...]
|
||||
]
|
||||
},
|
||||
"texts": ["Hello world", "How are you?"],
|
||||
"meta": {
|
||||
"api_version": {"version": "2"},
|
||||
"billed_units": {"input_tokens": 12}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Mixed text and image inputs
|
||||
|
||||
For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/v2/embed" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "google/siglip-so400m-patch14-384",
|
||||
"inputs": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "text", "text": "A photo of a cat"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}
|
||||
]
|
||||
}
|
||||
],
|
||||
"embedding_types": ["float"]
|
||||
}'
|
||||
```
|
||||
|
||||
#### Embedding types
|
||||
|
||||
The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call:
|
||||
|
||||
| Type | Description |
|
||||
| ---- | ----------- |
|
||||
| `float` | Raw float32 embeddings (default) |
|
||||
| `binary` | Bit-packed signed binary |
|
||||
| `ubinary` | Bit-packed unsigned binary |
|
||||
| `base64` | Little-endian float32 encoded as base64 |
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/v2/embed" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||
"input_type": "query",
|
||||
"texts": ["What is machine learning?"],
|
||||
"embedding_types": ["float", "binary"]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "embd-...",
|
||||
"embeddings": {
|
||||
"float": [[0.012, -0.034, ...]],
|
||||
"binary": [[42, -117, ...]]
|
||||
},
|
||||
"texts": ["What is machine learning?"],
|
||||
"meta": {
|
||||
"api_version": {"version": "2"},
|
||||
"billed_units": {"input_tokens": 8}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Truncation
|
||||
|
||||
The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled:
|
||||
|
||||
| Value | Behavior |
|
||||
| ----- | --------- |
|
||||
| `END` (default) | Keep the first tokens, drop the end |
|
||||
| `START` | Keep the last tokens, drop the beginning |
|
||||
| `NONE` | Return an error if the input is too long |
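For example, a sketch that keeps only the trailing tokens of an over-long input (reusing the text-embedding request above; only the `truncate` field is new):

```bash
curl -X POST "http://localhost:8000/v2/embed" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Snowflake/snowflake-arctic-embed-m-v1.5",
    "input_type": "query",
    "texts": ["A very long document that may exceed the maximum sequence length ..."],
    "truncate": "START",
    "embedding_types": ["float"]
  }'
```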
|
||||
|
||||
#### Input type and prompt prefixes
|
||||
|
||||
The `input_type` field selects a prompt prefix to prepend to each text input. The available values
|
||||
depend on the model:
|
||||
|
||||
- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are
|
||||
the valid `input_type` values and the corresponding value is prepended to each text.
|
||||
- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are
|
||||
the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`,
|
||||
so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`.
|
||||
- **Other models**: `input_type` is not accepted and will raise a validation error if passed.
|
||||
|
||||
## More examples
|
||||
|
||||
More examples can be found here: [examples/pooling/embed](../../../examples/pooling/embed)
|
||||
|
||||
## Supported Features
|
||||
|
||||
### Enable/disable normalize
|
||||
|
||||
You can enable or disable normalization via `use_activation`.
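A minimal sketch of obtaining unnormalized embeddings, assuming `use_activation` is passed through [PoolingParams][vllm.PoolingParams] (see the Removed Features section below):

```python
from vllm import LLM, PoolingParams

llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.embed(
    "Hello, my name is",
    pooling_params=PoolingParams(use_activation=False),
)

# Unnormalized embedding vector.
embeds = output.outputs.embedding
print(f"Embeddings: {embeds[:8]!r} ...")
```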
|
||||
|
||||
### Matryoshka Embeddings
|
||||
|
||||
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
|
||||
|
||||
!!! warning
|
||||
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
|
||||
|
||||
For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
|
||||
|
||||
```json
|
||||
{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
|
||||
```
|
||||
|
||||
#### Manually enable Matryoshka Embeddings
|
||||
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
|
||||
|
||||
For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
|
||||
|
||||
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||
|
||||
```bash
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
|
||||
```
|
||||
|
||||
#### Offline Inference
|
||||
|
||||
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams].
|
||||
|
||||
```python
|
||||
from vllm import LLM, PoolingParams
|
||||
|
||||
llm = LLM(
|
||||
model="jinaai/jina-embeddings-v3",
|
||||
runner="pooling",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
outputs = llm.embed(
|
||||
["Follow the white rabbit."],
|
||||
pooling_params=PoolingParams(dimensions=32),
|
||||
)
|
||||
print(outputs[0].outputs)
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy_offline.py](../../../examples/pooling/embed/embed_matryoshka_fy_offline.py)
|
||||
|
||||
#### Online Inference
|
||||
|
||||
Use the following command to start the vLLM server.
|
||||
|
||||
```bash
|
||||
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
||||
```
|
||||
|
||||
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:8000/v1/embeddings \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"input": "Follow the white rabbit.",
|
||||
"model": "jinaai/jina-embeddings-v3",
|
||||
"encoding_format": "float",
|
||||
"dimensions": 32
|
||||
}'
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```json
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy_client.py](../../../examples/pooling/embed/openai_embedding_matryoshka_fy_client.py)
|
||||
|
||||
## Removed Features
|
||||
|
||||
### Remove `normalize` from PoolingParams
|
||||
|
||||
We have removed `normalize` from `PoolingParams`; use `use_activation` instead.
|
||||
docs/models/pooling_models/reward.md (new file, 136 lines)
@@ -0,0 +1,136 @@
|
||||
# Reward Usages
|
||||
|
||||
A reward model (RM) is designed to evaluate and score the quality of outputs generated by a language model, acting as a proxy for human preferences.
|
||||
|
||||
## Summary
|
||||
|
||||
- Model Usage: reward
|
||||
- Pooling Task:
|
||||
|
||||
| Model Types | Pooling Tasks |
|
||||
|------------------------------------|----------------|
|
||||
| (sequence) (outcome) reward models | classify |
|
||||
| token (outcome) reward models | token_classify |
|
||||
| process reward models | token_classify |
|
||||
|
||||
- Offline APIs:
|
||||
- `LLM.encode(..., pooling_task="...")`
|
||||
- Online APIs:
|
||||
- Pooling API (`/pooling`)
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Reward Models
|
||||
|
||||
Sequence classification models can be used as (sequence) (outcome) reward models; their usage and supported features are the same as for normal [classification models](classify.md).
|
||||
|
||||
--8<-- [start:supported-sequence-reward-models]
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `Skywork/Skywork-Reward-V2-Qwen3-0.6B`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForSequenceClassification`<sup>C</sup> | Llama-based | `Skywork/Skywork-Reward-V2-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||
|
||||
--8<-- [end:supported-sequence-reward-models]
|
||||
|
||||
### Token Reward Models
|
||||
|
||||
The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence.
|
||||
|
||||
Token classification models can be used as token (outcome) reward models; their usage and supported features are the same as for normal [token classification models](token_classify.md).
|
||||
|
||||
--8<-- [start:supported-token-reward-models]
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model].
|
||||
|
||||
--8<-- [end:supported-token-reward-models]
|
||||
|
||||
### Process Reward Models
|
||||
|
||||
Process reward models evaluate intermediate steps, which are crucial to achieving the desired outcome.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
!!! important
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
### Pooling Parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
- Reward Models
|
||||
|
||||
Set `pooling_task="classify"` when using `LLM.encode` for (sequence) (outcome) reward models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="Skywork/Skywork-Reward-V2-Qwen3-0.6B", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="classify")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
- Token Reward Models
|
||||
|
||||
Set `pooling_task="token_classify"` when using `LLM.encode` for token (outcome) reward models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
- Process Reward Models
|
||||
|
||||
Set `pooling_task="token_classify"` when using `LLM.encode` for process reward models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2.5-Math-PRM-7B", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is<extra_0><extra_0><extra_0>", pooling_task="token_classify")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
Please refer to the [Pooling API](README.md#pooling-api). For the pooling task corresponding to each reward model type, refer to the [table above](#summary).
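For example, a sketch of querying a sequence reward model through the Pooling API (the request shape mirrors the Pooling API example in the README):

```bash
curl -X POST "http://localhost:8000/pooling" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Skywork/Skywork-Reward-V2-Qwen3-0.6B",
    "input": ["Hello, my name is"],
    "task": "classify"
  }'
```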
|
||||
docs/models/pooling_models/scoring.md (new file, 451 lines)
@@ -0,0 +1,451 @@
|
||||
# Scoring Usages
|
||||
|
||||
Score models are designed to compute similarity scores between two input prompts. Three model types (aka `score_type`) are supported: `cross-encoder`, `late-interaction`, and `bi-encoder`.
|
||||
|
||||
!!! note
|
||||
vLLM handles only the model inference component of RAG pipelines (such as embedding generation and reranking). For higher-level RAG orchestration, you should leverage integration frameworks like [LangChain](https://github.com/langchain-ai/langchain).
|
||||
|
||||
## Summary
|
||||
|
||||
- Model Usage: Scoring
|
||||
- Pooling Task:
|
||||
|
||||
| Score Types | Pooling Tasks | Scoring Function |
|--------------------|-----------------------|----------------------------|
| `cross-encoder` | `classify` (see note) | linear classifier |
| `late-interaction` | `token_embed` | late interaction (MaxSim) |
| `bi-encoder` | `embed` | cosine similarity |
|
||||
|
||||
- Offline APIs:
|
||||
- `LLM.score`
|
||||
- Online APIs:
|
||||
- [Score API](scoring.md#score-api) (`/score`)
|
||||
- [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
|
||||
!!! note
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Cross-encoder models
|
||||
|
||||
[Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
|
||||
|
||||
--8<-- [start:supported-cross-encoder-models]
|
||||
|
||||
#### Text-only Models
|
||||
|
||||
| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
|
||||
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
|
||||
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
|
||||
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
Some models require a specific prompt format to work correctly.
|
||||
|
||||
You can find the score template corresponding to each example HF model in [examples/pooling/score/template/](../../../examples/pooling/score/template).

Examples: [examples/pooling/score/using_template_offline.py](../../../examples/pooling/score/using_template_offline.py), [examples/pooling/score/using_template_online.py](../../../examples/pooling/score/using_template_online.py)
|
||||
|
||||
!!! note
|
||||
Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command.
|
||||
|
||||
```bash
|
||||
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
|
||||
```
|
||||
|
||||
!!! note
|
||||
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. Since the name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
|
||||
|
||||
!!! note
|
||||
Load the official original `mxbai-rerank-v2` by using the following command.
|
||||
|
||||
```bash
|
||||
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
|
||||
```
|
||||
|
||||
!!! note
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../../examples/pooling/score/qwen3_reranker_online.py).
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
#### Multimodal Models
|
||||
|
||||
!!! note
|
||||
For more information about multimodal model inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + I<sup>E+</sup> | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
|
||||
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](README.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
--8<-- [end:supported-cross-encoder-models]
|
||||
|
||||
### Late-interaction models
|
||||
|
||||
All models that support the token embedding task also support using the score API to compute similarity scores by calculating the late interaction of two input prompts. See [this page](token_embed.md) for more information about token embedding models.
|
||||
|
||||
--8<-- "docs/models/pooling_models/token_embed.md:supported-token-embed-models"
|
||||
|
||||
### Bi-encoder
|
||||
|
||||
All models that support the embedding task also support using the score API to compute similarity scores by calculating the cosine similarity of two input prompts' embeddings. See [this page](embed.md) for more information about embedding models.
|
||||
|
||||
--8<-- "docs/models/pooling_models/embed.md:supported-embed-models"
|
||||
|
||||
## Offline Inference
|
||||
|
||||
### Pooling Parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are only supported by cross-encoder models and do not work for late-interaction and bi-encoder models.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
### `LLM.score`
|
||||
|
||||
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
|
||||
(output,) = llm.score(
|
||||
"What is the capital of France?",
|
||||
"The capital of Brazil is Brasilia.",
|
||||
)
|
||||
|
||||
score = output.outputs.score
|
||||
print(f"Score: {score}")
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/basic/offline_inference/score.py](../../../examples/basic/offline_inference/score.py)
|
||||
|
||||
## Online Serving
|
||||
|
||||
### Score API
|
||||
|
||||
Our Score API (`/score`) is similar to `LLM.score`; it computes similarity scores between two input prompts.
|
||||
|
||||
#### Parameters
|
||||
|
||||
The following Score API parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
##### Single inference
|
||||
|
||||
You can pass a string to both `queries` and `documents`, forming a single sentence pair.
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"encoding_format": "float",
|
||||
"queries": "What is the capital of France?",
|
||||
"documents": "The capital of France is Paris."
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
|
||||
##### Batch inference
|
||||
|
||||
You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs
|
||||
where each pair is built from `queries` and a string in `documents`.
|
||||
The total number of pairs is `len(documents)`.
|
||||
|
||||
??? console "Request"
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"queries": "What is the capital of France?",
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693570,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 0.001094818115234375
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
|
||||
You can pass a list to both `queries` and `documents`, forming multiple sentence pairs
|
||||
where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`).
|
||||
The total number of pairs is `len(documents)`.
|
||||
|
||||
??? console "Request"
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"encoding_format": "float",
|
||||
"queries": [
|
||||
"What is the capital of Brazil?",
|
||||
"What is the capital of France?"
|
||||
],
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
|
||||
##### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to scoring models by including a `content` list of multi-modal items (image, etc.) in the request. Refer to the examples below for illustration.
|
||||
|
||||
=== "JinaVL-Reranker"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve jinaai/jina-reranker-m0
|
||||
```
|
||||
|
||||
Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
??? Code
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/score",
|
||||
json={
|
||||
"model": "jinaai/jina-reranker-m0",
|
||||
"queries": "slm markdown",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Scoring output:", response_json["data"][0]["score"])
|
||||
print("Scoring output:", response_json["data"][1]["score"])
|
||||
```
|
||||
Full example:
|
||||
|
||||
- [examples/pooling/score/vision_score_api_online.py](../../../examples/pooling/score/vision_score_api_online.py)
|
||||
- [examples/pooling/score/vision_rerank_api_online.py](../../../examples/pooling/score/vision_rerank_api_online.py)
|
||||
|
||||
### Rerank API
|
||||
|
||||
`/rerank`, `/v1/rerank`, and `/v2/rerank` APIs are compatible with both [Jina AI's rerank API interface](https://jina.ai/reranker/) and
|
||||
[Cohere's rerank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
|
||||
popular open-source tools.
|
||||
|
||||
Code example: [examples/pooling/score/rerank_api_online.py](../../../examples/pooling/score/rerank_api_online.py)
|
||||
|
||||
#### Parameters
|
||||
|
||||
The following Rerank API parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
|
||||
Result documents will be sorted by relevance, and the `index` property can be used to determine the original order (see the sketch after the example response below).
|
||||
|
||||
??? console "Request"
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/v1/rerank' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"query": "What is the capital of France?",
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris.",
|
||||
"Horses and cows are both animals"
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"usage": {
|
||||
"total_tokens": 56
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"index": 1,
|
||||
"document": {
|
||||
"text": "The capital of France is Paris."
|
||||
},
|
||||
"relevance_score": 0.99853515625
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"document": {
|
||||
"text": "The capital of Brazil is Brasilia."
|
||||
},
|
||||
"relevance_score": 0.0005860328674316406
|
||||
}
|
||||
]
|
||||
}
|
||||
```
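As an illustration, the following Python sketch sends the same request with an explicit `top_n` and then uses the `index` property to restore the original document order. The field names follow the example response above:

```python
import requests

response = requests.post(
    "http://127.0.0.1:8000/v1/rerank",
    json={
        "model": "BAAI/bge-reranker-base",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
        "top_n": 2,
    },
)
response.raise_for_status()
results = response.json()["results"]

# Results arrive sorted by relevance; sort by `index` to recover the request order.
for result in sorted(results, key=lambda r: r["index"]):
    print(result["index"], result["relevance_score"])
```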
|
||||
|
||||
## More examples
|
||||
|
||||
More examples can be found here: [examples/pooling/score](../../../examples/pooling/score)
|
||||
|
||||
## Supported Features
|
||||
|
||||
As cross-encoder models are a subset of classification models that accept two prompts as input and have `num_labels` equal to 1, cross-encoder features should be consistent with (sequence) classification. For more information, see [this page](classify.md#supported-features).
|
||||
|
||||
### Score Template
|
||||
|
||||
Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
|
||||
|
||||
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](../../serving/openai_compatible_server.md#chat-template)).
|
||||
|
||||
Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
|
||||
|
||||
- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
|
||||
- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
|
||||
|
||||
This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
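For illustration, here is a minimal template sketch rendered with the `jinja2` library. It is not an official vLLM template; the `query:`/`document:` prefixes are made up for this example and only show how the `selectattr` pattern resolves each role:

```python
import jinja2

# Hypothetical score template using role-based access to the messages list.
template = jinja2.Template(
    "query: {{ (messages | selectattr('role', 'eq', 'query') | first).content }}\n"
    "document: {{ (messages | selectattr('role', 'eq', 'document') | first).content }}"
)

messages = [
    {"role": "query", "content": "What is the capital of France?"},
    {"role": "document", "content": "The capital of France is Paris."},
]

# Renders:
# query: What is the capital of France?
# document: The capital of France is Paris.
print(template.render(messages=messages))
```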
|
||||
|
||||
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja)
|
||||
|
||||
### Enable/disable activation
|
||||
|
||||
You can enable or disable the output activation via `use_activation`; this option only works for cross-encoder models.
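A minimal offline sketch, assuming `use_activation` is exposed as a [PoolingParams][vllm.PoolingParams] field and that `LLM.score` accepts a `pooling_params` argument:

```python
from vllm import LLM, PoolingParams

llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")

# With use_activation=False, the raw classification logit is returned
# instead of the activated (e.g. sigmoid) probability.
(output,) = llm.score(
    "What is the capital of France?",
    "The capital of France is Paris.",
    pooling_params=PoolingParams(use_activation=False),
)
print(f"Raw score: {output.outputs.score}")
```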
|
||||
400
docs/models/pooling_models/specific_models.md
Normal file
@@ -0,0 +1,400 @@
|
||||
# Specific Model Examples
|
||||
|
||||
## ColBERT Late Interaction Models
|
||||
|
||||
[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy than single-vector models while being more efficient than cross-encoders.
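As a rough illustration of the late-interaction idea (this is not vLLM code; random tensors stand in for real per-token embeddings), MaxSim takes the best-matching document token for every query token and sums those similarities:

```python
import torch


def maxsim(query_embeds: torch.Tensor, doc_embeds: torch.Tensor) -> float:
    """ColBERT-style MaxSim over L2-normalized per-token embeddings."""
    # [num_query_tokens, num_doc_tokens] cosine similarities
    sim = query_embeds @ doc_embeds.T
    # For each query token, keep its best document token, then sum.
    return sim.max(dim=1).values.sum().item()


query_embeds = torch.nn.functional.normalize(torch.randn(5, 128), dim=-1)
doc_embeds = torch.nn.functional.normalize(torch.randn(40, 128), dim=-1)
print(maxsim(query_embeds, doc_embeds))
```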
|
||||
|
||||
vLLM supports ColBERT models with multiple encoder backbones:
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
|
||||
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
|
||||
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
|
||||
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |
|
||||
|
||||
**BERT-based ColBERT** models work out of the box:
|
||||
|
||||
```shell
|
||||
vllm serve answerdotai/answerai-colbert-small-v1
|
||||
```
|
||||
|
||||
For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture:
|
||||
|
||||
```shell
|
||||
# ModernBERT backbone
|
||||
vllm serve lightonai/GTE-ModernColBERT-v1 \
|
||||
--hf-overrides '{"architectures": ["ColBERTModernBertModel"]}'
|
||||
|
||||
# Jina XLM-RoBERTa backbone
|
||||
vllm serve jinaai/jina-colbert-v2 \
|
||||
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
|
||||
--trust-remote-code
|
||||
|
||||
# LFM2 backbone
|
||||
vllm serve LiquidAI/LFM2-ColBERT-350M \
|
||||
--hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
|
||||
```
|
||||
|
||||
Then you can use the rerank API:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the score API:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"text_1": "What is machine learning?",
|
||||
"text_2": ["Machine learning is a subset of AI.", "The weather is sunny."]
|
||||
}'
|
||||
```
|
||||
|
||||
You can also get the raw token embeddings using the pooling API with the `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../../examples/pooling/score/colbert_rerank_online.py)
|
||||
|
||||
## ColQwen3 Multi-Modal Late Interaction Models
|
||||
|
||||
ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
|
||||
| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
|
||||
| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
|
||||
```
|
||||
|
||||
### Text-only scoring and reranking
|
||||
|
||||
Use the `/rerank` API:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the `/score` API:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"text_1": "What is the capital of France?",
|
||||
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
|
||||
}'
|
||||
```
|
||||
|
||||
### Multi-modal scoring and reranking (text query × image documents)
|
||||
|
||||
The `/score` and `/rerank` APIs also accept multi-modal inputs directly.
|
||||
Pass image documents via the `data_1`/`data_2` fields (for `/score`) or the `documents` field (for `/rerank`),
|
||||
with a `content` list containing `image_url` and `text` parts — the same format used by the
|
||||
OpenAI chat completion API.
|
||||
|
||||
Score a text query against image documents:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"data_1": "Retrieve the city of Beijing",
|
||||
"data_2": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Rerank image documents by a text query:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"query": "Retrieve the city of Beijing",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
],
|
||||
"top_n": 2
|
||||
}'
|
||||
```
|
||||
|
||||
### Raw token embeddings
|
||||
|
||||
You can also get the raw token embeddings using the `/pooling` API with the `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
For **image inputs** via the pooling API, use the chat-style `messages` field:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
### Examples
|
||||
|
||||
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
|
||||
- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../../examples/pooling/score/colqwen3_rerank_online.py)
|
||||
|
||||
## ColQwen3.5 Multi-Modal Late Interaction Models
|
||||
|
||||
ColQwen3.5 is based on [ColPali](https://arxiv.org/abs/2407.01449), extending ColBERT's late interaction approach to **multi-modal** inputs. It uses the Qwen3.5 hybrid backbone (linear + full attention) and produces per-token L2-normalized vectors for MaxSim scoring.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `ColQwen3_5` | Qwen3.5 | `athrael-soju/colqwen3.5-4.5B` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve athrael-soju/colqwen3.5-4.5B --max-model-len 4096
|
||||
```
|
||||
|
||||
Then you can use the rerank endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "athrael-soju/colqwen3.5-4.5B",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the score endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "athrael-soju/colqwen3.5-4.5B",
|
||||
"text_1": "What is the capital of France?",
|
||||
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
|
||||
}'
|
||||
```
|
||||
|
||||
An example can be found here: [examples/pooling/score/colqwen3_5_rerank_online.py](../../../examples/pooling/score/colqwen3_5_rerank_online.py)
|
||||
|
||||
## Llama Nemotron Multimodal
|
||||
|
||||
### Embedding Model
|
||||
|
||||
Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
|
||||
(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
|
||||
single-vector embeddings from text and/or images.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
|
||||
--trust-remote-code \
|
||||
--chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
|
||||
```
|
||||
|
||||
!!! note
|
||||
The chat template bundled with this model's tokenizer is not suitable for
|
||||
the embeddings API. Use the provided override template above when serving
|
||||
with the `messages`-based (chat-style) embeddings API.
|
||||
|
||||
The override template uses the message `role` to automatically prepend the
|
||||
appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
|
||||
or `"document"` for passages (prepends `passage: `). Any other role omits
|
||||
the prefix.
|
||||
|
||||
Embed text queries:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "query",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is machine learning?"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Embed images via the chat-style `messages` field:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
|
||||
"messages": [
|
||||
{
|
||||
"role": "document",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
### Reranker Model
|
||||
|
||||
Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP
|
||||
backbone with a sequence-classification head for cross-encoder scoring and reranking.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
| - | - | - |
|
||||
| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja
|
||||
```
|
||||
|
||||
!!! note
|
||||
The chat template bundled with this checkpoint's tokenizer is not suitable
|
||||
for the Score/Rerank APIs. Use the provided override template when serving:
|
||||
`examples/pooling/score/template/nemotron-vl-rerank.jinja`.
|
||||
|
||||
Score a text query against an image document:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
|
||||
"data_1": "Find diagrams about autonomous robots",
|
||||
"data_2": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Robotics workflow diagram."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Rerank image documents by a text query:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
|
||||
"query": "Find diagrams about autonomous robots",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
|
||||
{"type": "text", "text": "Robotics workflow diagram."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
|
||||
{"type": "text", "text": "General skyline photo."}
|
||||
]
|
||||
}
|
||||
],
|
||||
"top_n": 2
|
||||
}'
|
||||
```
|
||||
|
||||
## BAAI/bge-m3
|
||||
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and ColBERT embeddings, but unfortunately in its `config.json`
|
||||
the architecture is declared as `XLMRobertaModel`, which makes vLLM load it as a vanilla RoBERTa model without the
|
||||
extra weights. To load the full model weights, override its architecture like this:
|
||||
|
||||
```shell
|
||||
vllm serve BAAI/bge-m3 --hf-overrides '{"architectures": ["BgeM3EmbeddingModel"]}'
|
||||
```
|
||||
|
||||
Then you obtain the sparse embeddings like this:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "BAAI/bge-m3",
|
||||
"task": "token_classify",
|
||||
"input": ["What is BGE M3?", "Definition of BM25"]
|
||||
}'
|
||||
```
|
||||
|
||||
Due to limitations in the output schema, the output consists of a list of
|
||||
per-token scores for each input. This means that you'll have to call
|
||||
`/tokenize` as well to be able to pair tokens with scores.
|
||||
Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how
|
||||
to do that.
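A rough sketch of that pairing (this is not the exact test code; the field names follow the Tokenizer and Pooling APIs and may differ slightly between versions):

```python
import requests

base_url = "http://localhost:8000"
text = "What is BGE M3?"

# Token IDs for the prompt, from the Tokenizer API.
tokens = requests.post(
    f"{base_url}/tokenize",
    json={"model": "BAAI/bge-m3", "prompt": text},
).json()["tokens"]

# One sparse score per token, from the Pooling API.
scores = requests.post(
    f"{base_url}/pooling",
    json={"model": "BAAI/bge-m3", "task": "token_classify", "input": text},
).json()["data"][0]["data"]

# Both calls should apply the same tokenization (including special tokens),
# so the two lists can be zipped together.
for token_id, score in zip(tokens, scores):
    print(token_id, score)
```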
|
||||
|
||||
You can obtain the colbert embeddings like this:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "BAAI/bge-m3",
|
||||
"task": "token_embed",
|
||||
"input": ["What is BGE M3?", "Definition of BM25"]
|
||||
}'
|
||||
```
|
||||
89
docs/models/pooling_models/token_classify.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Token Classification Usages
|
||||
|
||||
## Summary
|
||||
|
||||
- Model Usage: token classification
|
||||
- Pooling Tasks: `token_classify`
|
||||
- Offline APIs:
|
||||
- `LLM.encode(..., pooling_task="token_classify")`
|
||||
- Online APIs:
|
||||
- Pooling API (`/pooling`)
|
||||
|
||||
The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence.
|
||||
|
||||
Many classification models support both (sequence) classification and token classification. For further details on (sequence) classification, please refer to [this page](classify.md).
|
||||
|
||||
## Typical Use Cases
|
||||
|
||||
### Named Entity Recognition (NER)
|
||||
|
||||
For implementation examples, see:
|
||||
|
||||
Offline: [examples/pooling/token_classify/ner_offline.py](../../../examples/pooling/token_classify/ner_offline.py)
|
||||
|
||||
Online: [examples/pooling/token_classify/ner_online.py](../../../examples/pooling/token_classify/ner_online.py)
|
||||
|
||||
### Sparse retrieval (lexical matching)
|
||||
|
||||
The BAAI/bge-m3 model leverages token classification for sparse retrieval. For more information, see [this page](specific_models.md#baaibge-m3).
|
||||
|
||||
## Supported Models
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
|
||||
| `BertForTokenClassification` | BERT-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
|
||||
| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
|
||||
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
|
||||
| `Qwen3ForTokenClassification`<sup>C</sup> | Qwen3-based | `bd2lcco/Qwen3-0.6B-finetuned` | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||
|
||||
### As Reward Models
|
||||
|
||||
Token classification models can also be used as reward models. For details on reward models, see [Reward Models](reward.md).
|
||||
|
||||
--8<-- "docs/models/pooling_models/reward.md:supported-token-reward-models"
|
||||
|
||||
## Offline Inference
|
||||
|
||||
### Pooling Parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
Set `pooling_task="token_classify"` when using `LLM.encode` for token classification models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="boltuix/NeuroBERT-NER", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
Please refer to the [pooling API](README.md#pooling-api) and use `"task":"token_classify"`.
|
||||
|
||||
## More examples
|
||||
|
||||
More examples can be found here: [examples/pooling/token_classify](../../../examples/pooling/token_classify)
|
||||
|
||||
## Supported Features
|
||||
|
||||
Token classification features should be consistent with (sequence) classification. For more information, see [this page](classify.md#supported-features).
|
||||
126
docs/models/pooling_models/token_embed.md
Normal file
@@ -0,0 +1,126 @@
|
||||
# Token Embedding Usages
|
||||
|
||||
## Summary
|
||||
|
||||
- Model Usage: Token classification models
|
||||
- Pooling Tasks: `token_embed`
|
||||
- Offline APIs:
|
||||
- `LLM.encode(..., pooling_task="token_embed")`
|
||||
- Online APIs:
|
||||
- Pooling API (`/pooling`)
|
||||
|
||||
The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs an embedding for each token.
|
||||
|
||||
Many embedding models support both (sequence) embedding and token embedding. For further details on (sequence) embedding, please refer to [this page](embed.md).
|
||||
|
||||
## Typical Use Cases
|
||||
|
||||
### Multi-Vector Retrieval
|
||||
|
||||
For implementation examples, see:
|
||||
|
||||
Offline: [examples/pooling/token_embed/multi_vector_retrieval_offline.py](../../../examples/pooling/token_embed/multi_vector_retrieval_offline.py)
|
||||
|
||||
Online: [examples/pooling/token_embed/multi_vector_retrieval_online.py](../../../examples/pooling/token_embed/multi_vector_retrieval_online.py)
|
||||
|
||||
### Late interaction
|
||||
|
||||
Similarity scores can be computed using late interaction between two input prompts via the score API. For more information, see [Score API](scoring.md).
|
||||
|
||||
### Extract last hidden states
|
||||
|
||||
Models of any architecture can be converted into embedding models using `--convert embed`. Token embedding can then be used to extract the last hidden states from these models.
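A minimal sketch, assuming the `--convert embed` option is also available offline as the `convert` argument of `LLM` (the checkpoint below is just an illustrative generative model):

```python
from vllm import LLM

# Convert a generative model into an embedding model, then extract
# one hidden-state vector per input token.
llm = LLM(model="Qwen/Qwen3-0.6B", runner="pooling", convert="embed")
(output,) = llm.encode("Hello, my name is", pooling_task="token_embed")

data = output.outputs.data
print(f"Data: {data!r}")
```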
|
||||
|
||||
## Supported Models
|
||||
|
||||
--8<-- [start:supported-token-embed-models]
|
||||
|
||||
### Text-only Models
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
|
||||
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
|
||||
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
|
||||
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
### Multimodal Models
|
||||
|
||||
!!! note
|
||||
For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----- | ----------------- | ------------------------------ | ------------------------------------------ |
|
||||
| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
|
||||
| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | |
|
||||
| `ColQwen3` | Qwen3-VL | T / I | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` | | |
|
||||
| `ColQwen3_5` | ColQwen3.5 | T + I + V | `athrael-soju/colqwen3.5-4.5B-v3` | | |
|
||||
| `OpsColQwen3Model` | Qwen3-VL | T / I | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` | | |
|
||||
| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | T / I | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` | ✅︎ | ✅︎ |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./README.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using [as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model].
|
||||
|
||||
--8<-- [end:supported-token-embed-models]
|
||||
|
||||
## Offline Inference
|
||||
|
||||
### Pooling Parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:embed-pooling-params"
|
||||
```
|
||||
|
||||
### `LLM.encode`
|
||||
|
||||
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
Set `pooling_task="token_embed"` when using `LLM.encode` for token embedding models:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="answerdotai/answerai-colbert-small-v1", runner="pooling")
|
||||
(output,) = llm.encode("Hello, my name is", pooling_task="token_embed")
|
||||
|
||||
data = output.outputs.data
|
||||
print(f"Data: {data!r}")
|
||||
```
|
||||
|
||||
### `LLM.score`
|
||||
|
||||
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
|
||||
|
||||
All models that support the token embedding task also support using the score API to compute similarity scores by calculating the late interaction between two input prompts.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="answerdotai/answerai-colbert-small-v1", runner="pooling")
|
||||
(output,) = llm.score(
|
||||
"What is the capital of France?",
|
||||
"The capital of Brazil is Brasilia.",
|
||||
)
|
||||
|
||||
score = output.outputs.score
|
||||
print(f"Score: {score}")
|
||||
```
|
||||
|
||||
## Online Serving
|
||||
|
||||
Please refer to the [pooling API](README.md#pooling-api) and use `"task":"token_embed"`.
|
||||
|
||||
## More examples
|
||||
|
||||
More examples can be found here: [examples/pooling/token_embed](../../../examples/pooling/token_embed)
|
||||
|
||||
## Supported Features
|
||||
|
||||
Token embedding features should be consistent with (sequence) embedding. For more information, see [this page](embed.md#supported-features).
|
||||
@@ -1,6 +1,6 @@
|
||||
# Supported Models
|
||||
|
||||
vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks.
|
||||
vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models/README.md) models across various tasks.
|
||||
|
||||
For each task, we list the model architectures that have been implemented in vLLM.
|
||||
Alongside each architecture, we include some popular models that use it.
|
||||
@@ -499,156 +499,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
|
||||
!!! note
|
||||
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
|
||||
|
||||
### Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
|
||||
!!! important
|
||||
Since some model architectures support both generative and pooling tasks,
|
||||
you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode.
|
||||
|
||||
#### Embedding
|
||||
|
||||
These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
|
||||
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
|
||||
| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | |
|
||||
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
|
||||
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
|
||||
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
|
||||
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
|
||||
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
|
||||
| `VoyageQwen3BidirectionalEmbedModel`<sup>C</sup> | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ |
|
||||
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
|
||||
!!! note
|
||||
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
|
||||
See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
|
||||
|
||||
!!! note
|
||||
`jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, but vLLM currently only supports the text-matching task by merging LoRA weights.
|
||||
|
||||
!!! note
|
||||
The second-generation GTE model (mGTE-TRM) is named `NewModel`. Because the name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings
|
||||
of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
|
||||
|
||||
#### Classification
|
||||
|
||||
These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | |
|
||||
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||
|
||||
#### Cross-encoder / Reranker
|
||||
|
||||
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
|
||||
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
|
||||
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | N/A | | |
|
||||
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
|
||||
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
|
||||
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
Some models require a specific prompt format to work correctly.
|
||||
|
||||
You can find the score template corresponding to each example HF model in [examples/pooling/score/template/](../../examples/pooling/score/template)
|
||||
|
||||
Examples: [examples/pooling/score/using_template_offline.py](../../examples/pooling/score/using_template_offline.py), [examples/pooling/score/using_template_online.py](../../examples/pooling/score/using_template_online.py)
|
||||
|
||||
!!! note
|
||||
Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command.
|
||||
|
||||
```bash
|
||||
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
|
||||
```
|
||||
|
||||
!!! note
|
||||
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. Because the name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
|
||||
|
||||
!!! note
|
||||
Load the official original `mxbai-rerank-v2` by using the following command.
|
||||
|
||||
```bash
|
||||
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
|
||||
```
|
||||
|
||||
!!! note
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../examples/pooling/score/qwen3_reranker_online.py).
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
#### Reward Modeling
|
||||
|
||||
These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
!!! important
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
#### Token Classification
|
||||
|
||||
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
|
||||
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
|
||||
| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
|
||||
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
|
||||
|
||||
!!! note
|
||||
Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner_offline.py](../../examples/pooling/token_classify/ner_offline.py), [examples/pooling/token_classify/ner_online.py](../../examples/pooling/token_classify/ner_online.py).
|
||||
|
||||
## List of Multimodal Language Models
|
||||
|
||||
The following modalities are supported depending on the model:
|
||||
@@ -707,7 +557,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎ |
|
||||
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
|
||||
@@ -816,56 +666,23 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
!!! note
|
||||
`VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
|
||||
|
||||
### Pooling Models
|
||||
## Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
See [this page](pooling_models/README.md) for more information on how to use pooling models.
|
||||
|
||||
#### Embedding
|
||||
!!! important
|
||||
Since some model architectures support both generative and pooling tasks,
|
||||
you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode.
|
||||
|
||||
These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
|
||||
See the link below for more information on the models supported for specific pooling tasks.
|
||||
|
||||
!!! note
|
||||
To get the best results, you should use pooling models that are specifically trained as such.
|
||||
|
||||
The following table lists those that are tested in vLLM.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
|
||||
| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
|
||||
| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | |
|
||||
| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
|
||||
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
---
|
||||
|
||||
#### Cross-encoder / Reranker
|
||||
|
||||
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
|
||||
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + I<sup>E+</sup> | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
|
||||
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
- [Classification Usages](pooling_models/classify.md)
|
||||
- [Embedding Usages](pooling_models/embed.md)
|
||||
- [Reward Usages](pooling_models/reward.md)
|
||||
- [Token Classification Usages](pooling_models/token_classify.md)
|
||||
- [Token Embedding Usages](pooling_models/token_embed.md)
|
||||
- [Scoring Usages](pooling_models/scoring.md)
|
||||
- [Specific Model Examples](pooling_models/specific_models.md)
|
||||
|
||||
## Model Support Policy
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
|
||||
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
|
||||
| `flashinfer_nvlink_one_sided` | MNNVL systems | FlashInfer's one-sided A2A strategy for multi-node NVLink | High-throughput workloads |
|
||||
| `flashinfer_nvlink_two_sided` | MNNVL systems | FlashInfer's two-sided A2A strategy for multi-node NVLink | Systems with NVLink across nodes |
|
||||
| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
|
||||
|
||||
## Single Node Deployment
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ After initializing the `LLM` instance, use the available APIs to perform model i
|
||||
The available APIs depend on the model type:
|
||||
|
||||
- [Generative models](../models/generative_models.md) output logprobs which are sampled from to obtain the final output text.
|
||||
- [Pooling models](../models/pooling_models.md) output their hidden states directly.
|
||||
- [Pooling models](../models/pooling_models/README.md) output their hidden states directly.
|
||||
|
||||
!!! info
|
||||
[API Reference](../api/README.md#offline-inference)
|
||||
|
||||
@@ -53,8 +53,8 @@ We currently support the following OpenAI APIs:
|
||||
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
|
||||
- *Note: `user` parameter is ignored.*
|
||||
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
|
||||
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
|
||||
- Only applicable to [embedding models](../models/pooling_models.md).
|
||||
- [Embeddings API](../models/pooling_models/embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
|
||||
- Only applicable to [embedding models](../models/pooling_models/embed.md).
|
||||
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
- [Translation API](#translations-api) (`/v1/audio/translations`)
|
||||
@@ -66,17 +66,19 @@ In addition, we have the following custom APIs:
|
||||
|
||||
- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
|
||||
- Applicable to any model with a tokenizer.
|
||||
- [Pooling API](#pooling-api) (`/pooling`)
|
||||
- Applicable to all [pooling models](../models/pooling_models.md).
|
||||
- [Classification API](#classification-api) (`/classify`)
|
||||
- Only applicable to [classification models](../models/pooling_models.md).
|
||||
- [Score API](#score-api) (`/score`)
|
||||
- Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
|
||||
- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
|
||||
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
|
||||
- [pooling API](../models/pooling_models/README.md#pooling-api) (`/pooling`)
|
||||
- Applicable to all [pooling models](../models/pooling_models/README.md).
|
||||
- [Classification API](../models/pooling_models/classify.md#classification-api) (`/classify`)
|
||||
- Only applicable to [classification models](../models/pooling_models/classify.md).
|
||||
- [Cohere Embed API](../models/pooling_models/embed.md#cohere-embed-api) (`/v2/embed`)
|
||||
- Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed)
|
||||
- Works with any [embedding model](../models/pooling_models/embed.md#supported-models), including multimodal models.
|
||||
- [Score API](../models/pooling_models/scoring.md#score-api) (`/score`)
|
||||
- Applicable to [score models](../models/pooling_models/scoring.md).
|
||||
- [Rerank API](../models/pooling_models/scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
- Implements [Jina AI's v1 rerank API](https://jina.ai/reranker/)
|
||||
- Also compatible with [Cohere's v1 & v2 rerank APIs](https://docs.cohere.com/v2/reference/rerank)
|
||||
- Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
|
||||
- Only applicable to [cross-encoder models](../models/pooling_models.md).
|
||||
|
||||
## Chat Template
|
||||
|
||||
@@ -266,169 +268,6 @@ The following extra parameters in the response object are supported:
|
||||
--8<-- "vllm/entrypoints/openai/responses/protocol.py:responses-response-extra-params"
|
||||
```
|
||||
|
||||
### Embeddings API
|
||||
|
||||
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
|
||||
|
||||
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
|
||||
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from typing import Literal, Union

from openai import OpenAI
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
|
||||
def create_chat_embeddings(
|
||||
client: OpenAI,
|
||||
*,
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
model: str,
|
||||
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
|
||||
) -> CreateEmbeddingResponse:
|
||||
return client.post(
|
||||
"/embeddings",
|
||||
cast_to=CreateEmbeddingResponse,
|
||||
body={"messages": messages, "model": model, "encoding_format": encoding_format},
|
||||
)
|
||||
```
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
|
||||
and passing a list of `messages` in the request. Refer to the examples below for illustration.
|
||||
|
||||
=== "VLM2Vec"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling`
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
|
||||
|
||||
Since this request schema is not covered by the OpenAI Python client's built-in methods, we use the `create_chat_embeddings` convenience function defined above to post the request:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="EMPTY",
|
||||
)
|
||||
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
```
|
||||
|
||||
=== "DSE-Qwen2-MRL"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 8192 \
|
||||
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
|
||||
|
||||
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
|
||||
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
|
||||
|
||||
!!! important
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
|
||||
Full example: [examples/pooling/embed/vision_embedding_online.py](../../examples/pooling/embed/vision_embedding_online.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:embed-pooling-params"
|
||||
```
|
||||
|
||||
The following Embeddings API parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
|
||||
```
|
||||
|
||||
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
|
||||
|
||||
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported instead:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
|
||||
```
|
||||
|
||||
### Transcriptions API
|
||||
|
||||
Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
|
||||
@@ -625,172 +464,8 @@ It consists of two endpoints:
|
||||
- `/tokenize` corresponds to calling `tokenizer.encode()`.
|
||||
- `/detokenize` corresponds to calling `tokenizer.decode()`.
|
||||
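For example, a minimal round trip through both endpoints with the `requests` library might look like the following sketch. The request and response field names shown here are assumptions for illustration; consult the server's OpenAPI schema for the exact protocol.

```python
import requests

base_url = "http://localhost:8000"
model = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder model name

# /tokenize roughly corresponds to tokenizer.encode()
tokenize = requests.post(
    f"{base_url}/tokenize",
    json={"model": model, "prompt": "Hello, world!"},
)
tokens = tokenize.json()["tokens"]

# /detokenize roughly corresponds to tokenizer.decode()
detokenize = requests.post(
    f"{base_url}/detokenize",
    json={"model": model, "tokens": tokens},
)
print(detokenize.json()["prompt"])  # expected to round-trip back to the original text
```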
|
||||
### Pooling API
|
||||
|
||||
Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
|
||||
|
||||
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
|
||||
|
||||
Code example: [examples/pooling/pooling/pooling_online.py](../../examples/pooling/pooling/pooling_online.py)
|
||||
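As a quick illustration, a minimal `/pooling` request with the `requests` library might look like the following sketch; the model name is an arbitrary placeholder and the exact response field layout is an assumption, so see the linked example for the canonical client code.

```python
import requests

response = requests.post(
    "http://localhost:8000/pooling",
    json={
        "model": "BAAI/bge-m3",     # placeholder pooling model
        "input": "vLLM is great!",  # same input format as the Embeddings API
    },
)
response.raise_for_status()

# Unlike /v1/embeddings, each returned item may contain an arbitrarily nested
# list of floats (e.g. per-token hidden states) rather than a flat 1-D vector.
print(response.json()["data"][0]["data"])
```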
|
||||
### Classification API
|
||||
|
||||
Our Classification API directly supports Hugging Face sequence-classification models such as [ai21labs/Jamba-tiny-reward-dev](https://huggingface.co/ai21labs/Jamba-tiny-reward-dev) and [jason9693/Qwen2.5-1.5B-apeach](https://huggingface.co/jason9693/Qwen2.5-1.5B-apeach).
|
||||
|
||||
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
|
||||
|
||||
Code example: [examples/pooling/classify/classification_online.py](../../examples/pooling/classify/classification_online.py)
|
||||
|
||||
#### Example Requests
|
||||
|
||||
You can classify multiple texts by passing an array of strings:
|
||||
|
||||
```bash
|
||||
curl -v "http://127.0.0.1:8000/classify" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"input": [
|
||||
"Loved the new café—coffee was great.",
|
||||
"This update broke everything. Frustrating."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
|
||||
"object": "list",
|
||||
"created": 1745383065,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
],
|
||||
"num_classes": 2
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"label": "Spoiled",
|
||||
"probs": [
|
||||
0.26448777318000793,
|
||||
0.7355121970176697
|
||||
],
|
||||
"num_classes": 2
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 20,
|
||||
"total_tokens": 20,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
You can also pass a string directly to the `input` field:
|
||||
|
||||
```bash
|
||||
curl -v "http://127.0.0.1:8000/classify" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"input": "Loved the new café—coffee was great."
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
|
||||
"object": "list",
|
||||
"created": 1745383213,
|
||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"label": "Default",
|
||||
"probs": [
|
||||
0.565970778465271,
|
||||
0.4340292513370514
|
||||
],
|
||||
"num_classes": 2
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 10,
|
||||
"total_tokens": 10,
|
||||
"completion_tokens": 0,
|
||||
"prompt_tokens_details": null
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
The following Classification API parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported instead:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
### Score API
|
||||
|
||||
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model, the score corresponds to the cosine similarity between each embedding pair.
|
||||
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
|
||||
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
|
||||
Code example: [examples/pooling/score/score_api_online.py](../../examples/pooling/score/score_api_online.py)
|
||||
|
||||
#### Score Template
|
||||
|
||||
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
|
||||
@@ -806,307 +481,6 @@ This approach is more robust than index-based access (`messages[0]`, `messages[1
|
||||
|
||||
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
|
||||
|
||||
#### Single inference
|
||||
|
||||
You can pass a string to both `queries` and `documents`, forming a single sentence pair.
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"encoding_format": "float",
|
||||
"queries": "What is the capital of France?",
|
||||
"documents": "The capital of France is Paris."
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
|
||||
#### Batch inference
|
||||
|
||||
You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs
|
||||
where each pair is built from `queries` and a string in `documents`.
|
||||
The total number of pairs is `len(documents)`.
|
||||
|
||||
??? console "Request"
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"queries": "What is the capital of France?",
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693570,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 0.001094818115234375
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
|
||||
You can pass a list to both `queries` and `documents`, forming multiple sentence pairs
|
||||
where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`).
|
||||
The total number of pairs is `len(documents)`.
|
||||
|
||||
??? console "Request"
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/score' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"encoding_format": "float",
|
||||
"queries": [
|
||||
"What is the capital of Brazil?",
|
||||
"What is the capital of France?"
|
||||
],
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "score-request-id",
|
||||
"object": "list",
|
||||
"created": 693447,
|
||||
"model": "BAAI/bge-reranker-v2-m3",
|
||||
"data": [
|
||||
{
|
||||
"index": 0,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"object": "score",
|
||||
"score": 1
|
||||
}
|
||||
],
|
||||
"usage": {}
|
||||
}
|
||||
```
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal input (image, etc.) in the request. Refer to the examples below for illustration.
|
||||
|
||||
=== "JinaVL-Reranker"
|
||||
|
||||
To serve the model:
|
||||
|
||||
```bash
|
||||
vllm serve jinaai/jina-reranker-m0
|
||||
```
|
||||
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/score",
|
||||
json={
|
||||
"model": "jinaai/jina-reranker-m0",
|
||||
"queries": "slm markdown",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Scoring output:", response_json["data"][0]["score"])
|
||||
print("Scoring output:", response_json["data"][1]["score"])
|
||||
```
|
||||
Full example:
|
||||
|
||||
- [examples/pooling/score/vision_score_api_online.py](../../examples/pooling/score/vision_score_api_online.py)
|
||||
- [examples/pooling/score/vision_rerank_api_online.py](../../examples/pooling/score/vision_rerank_api_online.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
The following Score API parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
### Re-rank API
|
||||
|
||||
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevance scores between a single query and
|
||||
each document in a list. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1.
|
||||
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
|
||||
The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the
|
||||
`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank`
|
||||
endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and
|
||||
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
|
||||
popular open-source tools.
|
||||
|
||||
Code example: [examples/pooling/score/rerank_api_online.py](../../examples/pooling/score/rerank_api_online.py)
|
||||
|
||||
#### Example Request
|
||||
|
||||
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
|
||||
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
|
||||
|
||||
??? console "Request"
|
||||
|
||||
```bash
|
||||
curl -X 'POST' \
|
||||
'http://127.0.0.1:8000/v1/rerank' \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"query": "What is the capital of France?",
|
||||
"documents": [
|
||||
"The capital of Brazil is Brasilia.",
|
||||
"The capital of France is Paris.",
|
||||
"Horses and cows are both animals"
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
|
||||
"model": "BAAI/bge-reranker-base",
|
||||
"usage": {
|
||||
"total_tokens": 56
|
||||
},
|
||||
"results": [
|
||||
{
|
||||
"index": 1,
|
||||
"document": {
|
||||
"text": "The capital of France is Paris."
|
||||
},
|
||||
"relevance_score": 0.99853515625
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"document": {
|
||||
"text": "The capital of Brazil is Brasilia."
|
||||
},
|
||||
"relevance_score": 0.0005860328674316406
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classify-pooling-params"
|
||||
```
|
||||
|
||||
The following Re-rank API parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
```
|
||||
|
||||
## Ray Serve LLM
|
||||
|
||||
Ray Serve LLM enables scalable, production-grade serving of the vLLM engine. It integrates tightly with vLLM and extends it with features such as auto-scaling, load balancing, and back-pressure.
|
||||
|
||||
63
docs/training/async_rl.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# Async Reinforcement Learning
|
||||
|
||||
## Overview
|
||||
|
||||
In a standard RL training loop, generation and training happen sequentially: the policy generates rollouts, then training runs on those rollouts, and the cycle repeats. During generation the training accelerators sit idle, and vice versa.
|
||||
|
||||
The **one-off pipelining** approach separates the generation and training phases into two parallel coroutines, allowing the model to generate new samples while simultaneously training on previously generated data. This can lead to better GPU utilization and greater training throughput.
|
||||
|
||||
However, this overlap introduces a complication: weights must be updated in the inference engine mid-flight, while requests may still be in progress.
|
||||
|
||||
## The Pause and Resume API
|
||||
|
||||
To safely update weights while the inference engine is running, vLLM provides `pause_generation` and `resume_generation` methods. These let the trainer coordinate a clean window for weight synchronization without losing in-flight work.
|
||||
|
||||
### pause_generation
|
||||
|
||||
```python
|
||||
await engine.pause_generation(mode="keep", clear_cache=True)
|
||||
```
|
||||
|
||||
The `mode` parameter controls how in-flight requests are handled:
|
||||
|
||||
| Mode | Behavior |
|
||||
| ---- | -------- |
|
||||
| `"abort"` | Abort all in-flight requests immediately and return partial results (default) |
|
||||
| `"wait"` | Wait for all in-flight requests to finish before pausing |
|
||||
| `"keep"` | Freeze requests in the queue; they resume when `resume_generation` is called |
|
||||
|
||||
The `clear_cache` parameter controls whether to clear the KV cache and prefix cache after pausing.
|
||||
|
||||
### resume_generation
|
||||
|
||||
```python
|
||||
await engine.resume_generation()
|
||||
```
|
||||
|
||||
Resumes the scheduler after a pause. Any requests frozen with `mode="keep"` will continue generating.
|
||||
|
||||
### HTTP Endpoints
|
||||
|
||||
When using the vLLM HTTP server, the same functionality is available via:
|
||||
|
||||
- `POST /pause?mode=keep` - Pause generation
|
||||
- `POST /resume` - Resume generation
|
||||
|
||||
!!! note "Data Parallelism"
|
||||
When using data parallelism with vLLM's **internal load balancer** (i.e. `data_parallel_backend="ray"`), pause and resume are handled automatically across all DP ranks -- a single call is sufficient. When using an **external load balancer** (i.e. multiple independent vLLM instances behind a proxy), you must send pause and resume requests to **every** engine instance individually before and after the weight update.
|
||||
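A minimal sketch of driving these endpoints with the `requests` library, including the external-load-balancer case where every engine instance must be paused and resumed individually (the URLs are placeholders):

```python
import requests

# One URL per vLLM engine instance; a single entry suffices when vLLM's
# internal (Ray-based) data-parallel load balancer is used.
engine_urls = ["http://engine-0:8000", "http://engine-1:8000"]

# Pause generation, keeping in-flight requests frozen in the queue.
for url in engine_urls:
    requests.post(f"{url}/pause", params={"mode": "keep"}).raise_for_status()

# ... synchronize the updated weights here (see the Weight Transfer docs) ...

# Resume generation; frozen requests continue with the new weights.
for url in engine_urls:
    requests.post(f"{url}/resume").raise_for_status()
```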
|
||||
## Typical Async RL Flow
|
||||
|
||||
A typical async RL loop with weight syncing looks like this:
|
||||
|
||||
1. Start generating rollouts from the current policy
|
||||
2. Once trainer has new weights to update to, pause generation with `mode="keep"`
|
||||
3. Sync the updated weights from the trainer to the inference engine (see [Weight Transfer](weight_transfer/README.md))
|
||||
4. Resume generation -- in-flight requests continue with the new weights
|
||||
5. Repeat
|
||||
|
||||
The key insight is that requests paused with `mode="keep"` will produce tokens from the **old** weights before the pause and tokens from the **new** weights after resume. The `clear_cache` parameter controls whether the KV cache is invalidated during the pause. When `clear_cache=True`, previously cached key-value entries are discarded, so all tokens generated after resume will be computed entirely with the new weights. When `clear_cache=False`, existing KV cache entries are retained, meaning some tokens in context may still reflect the old weights (stale KV cache).
|
||||
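Putting the steps together, a minimal in-process sketch of this loop might look like the following. The `engine` is assumed to be an async vLLM engine exposing the `pause_generation`/`resume_generation` methods described above; `trainer` and its `wait_for_new_weights`/`sync_weights` methods are hypothetical placeholders for your training framework.

```python
async def async_rl_loop(engine, trainer, num_iterations: int) -> None:
    for _ in range(num_iterations):
        # Rollout generation is already running concurrently elsewhere; once
        # the trainer has produced new weights, open a weight-sync window.
        await trainer.wait_for_new_weights()  # hypothetical trainer hook

        # Freeze in-flight requests instead of aborting them.
        await engine.pause_generation(mode="keep", clear_cache=True)

        # Push the updated weights to the inference engine
        # (NCCL, IPC, or a custom backend -- see the Weight Transfer docs).
        await trainer.sync_weights(engine)  # hypothetical helper

        # Resume; frozen requests continue with the new weights.
        await engine.resume_generation()
```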
|
||||
## Example
|
||||
|
||||
The [async RLHF example](../examples/rl/rlhf_async_new_apis.md) demonstrates this pattern with `vllm.AsyncLLMEngine`, NCCL weight transfer, and mid-flight pause/resume with validation.
|
||||
@@ -16,11 +16,9 @@ The following open-source RL libraries use vLLM for fast rollouts (sorted alphab
|
||||
- [Unsloth](https://github.com/unslothai/unsloth)
|
||||
- [verl](https://github.com/volcengine/verl)
|
||||
|
||||
See the following basic examples to get started if you don't want to use an existing library:
|
||||
For weight synchronization between training and inference, see the [Weight Transfer](weight_transfer/README.md) documentation, which covers the pluggable backend system with [NCCL](weight_transfer/nccl.md) (multi-GPU) and [IPC](weight_transfer/ipc.md) (same-GPU) engines.
|
||||
|
||||
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
|
||||
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
|
||||
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
|
||||
For pipelining generation and training to improve GPU utilization and throughput, see the [Async Reinforcement Learning](async_rl.md) guide, which covers the pause/resume API for safely updating weights mid-flight.
|
||||
|
||||
See the following notebooks showing how to use vLLM for GRPO:
|
||||
|
||||
|
||||
78
docs/training/weight_transfer/README.md
Normal file
@@ -0,0 +1,78 @@
|
||||
# Weight Transfer
|
||||
|
||||
vLLM provides a pluggable weight transfer system for synchronizing model weights from a training process to the inference engine during reinforcement learning (RL) workflows. This is essential for RLHF, GRPO, and other online RL methods where the policy model is iteratively updated during training and the updated weights must be reflected in the inference engine for rollout generation.
|
||||
|
||||
## Architecture
|
||||
|
||||
The weight transfer system follows a **two-phase protocol** with a pluggable backend design:
|
||||
|
||||
1. **Initialization** (`init_weight_transfer_engine`): Establishes the communication channel between the trainer and inference workers. Called once before the training loop begins.
|
||||
2. **Weight Update** (`update_weights`): Transfers updated weights from the trainer to the inference engine. Called after each training step (or batch of steps).
|
||||
|
||||
## Available Backends
|
||||
|
||||
| Backend | Transport | Use Case |
|
||||
| ------- | --------- | -------- |
|
||||
| [NCCL](nccl.md) | NCCL broadcast | Separate GPUs for training and inference |
|
||||
| [IPC](ipc.md) | CUDA IPC handles | Colocated training and inference on same GPU |
|
||||
|
||||
## Configuration
|
||||
|
||||
Specify the weight transfer backend through `WeightTransferConfig`. The backend determines which engine handles the weight synchronization.
|
||||
|
||||
### Programmatic (Offline Inference)
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from vllm.config import WeightTransferConfig
|
||||
|
||||
llm = LLM(
|
||||
model="my-model",
|
||||
weight_transfer_config=WeightTransferConfig(backend="nccl"), # or "ipc"
|
||||
)
|
||||
```
|
||||
|
||||
### CLI (Online Serving)
|
||||
|
||||
```bash
|
||||
vllm serve my-model \
|
||||
--weight-transfer-config '{"backend": "nccl"}'
|
||||
```
|
||||
|
||||
The `backend` field accepts `"nccl"` (default) or `"ipc"`.
|
||||
|
||||
## API Endpoints
|
||||
|
||||
When running vLLM as an HTTP server, the following endpoints are available for weight transfer:
|
||||
|
||||
| Endpoint | Method | Description |
|
||||
| -------- | ------ | ----------- |
|
||||
| `/init_weight_transfer_engine` | POST | Initialize the weight transfer engine with backend-specific info |
|
||||
| `/update_weights` | POST | Trigger a weight update with backend-specific metadata |
|
||||
| `/pause` | POST | Pause generation before weight sync to handle inflight requests |
|
||||
| `/resume` | POST | Resume generation after weight sync |
|
||||
| `/get_world_size` | GET | Get the number of inference workers (useful for NCCL world size calculation) |
|
||||
|
||||
!!! note
|
||||
The HTTP weight transfer endpoints require `VLLM_SERVER_DEV_MODE=1` to be set.
|
||||
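As a rough sketch, the control-plane side of a weight update over HTTP could look like the following. The request bodies for `/init_weight_transfer_engine` and `/update_weights` are backend-specific, and the JSON field names and values shown here are assumptions modeled on the request classes described in [Base Class](base.md); they are illustrative only (see [NCCL](nccl.md) and [IPC](ipc.md) for the real backend parameters).

```python
import requests

base = "http://localhost:8000"  # server started with VLLM_SERVER_DEV_MODE=1

# Number of inference workers, e.g. to compute the NCCL world size.
world_size_resp = requests.get(f"{base}/get_world_size")

# Initialize the transfer engine once, before the training loop.
requests.post(
    f"{base}/init_weight_transfer_engine",
    json={
        "init_info": {  # assumed field name, mirroring WeightTransferInitRequest
            "master_address": "10.0.0.1",
            "master_port": 29500,
            "rank_offset": 1,
            "world_size": 3,
        }
    },
)

# For each training step: pause, trigger the update, then resume.
requests.post(f"{base}/pause", params={"mode": "keep"})
requests.post(
    f"{base}/update_weights",
    json={
        "update_info": {  # assumed field name, mirroring WeightTransferUpdateRequest
            # Parameter metadata must match the trainer's iteration order;
            # the entries below are purely illustrative.
            "names": ["model.embed_tokens.weight"],
            "dtype_names": ["bfloat16"],
            "shapes": [[32000, 4096]],
        }
    },
)
requests.post(f"{base}/resume")
```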
|
||||
## Trainer-Side API
|
||||
|
||||
Both backends provide static methods that the trainer calls to send weights. The general pattern is:
|
||||
|
||||
```python
|
||||
# 1. Initialize the transfer engine (backend-specific)
|
||||
EngineClass.trainer_init(init_info)
|
||||
|
||||
# 2. Send weights to inference workers
|
||||
EngineClass.trainer_send_weights(
|
||||
iterator=model.named_parameters(),
|
||||
trainer_args=backend_specific_args,
|
||||
)
|
||||
```
|
||||
|
||||
See the [NCCL](nccl.md) and [IPC](ipc.md) pages for backend-specific trainer APIs and full examples.
|
||||
|
||||
## Extending the System
|
||||
|
||||
The weight transfer system is designed to be extensible. You can implement custom backends by subclassing `WeightTransferEngine` and registering them with the factory. See the [Base Class](base.md) page for details.
|
||||
162
docs/training/weight_transfer/base.md
Normal file
@@ -0,0 +1,162 @@
|
||||
# Base Class and Custom Engines
|
||||
|
||||
The weight transfer system is built on an abstract base class that defines the contract between vLLM's worker infrastructure and the transport backend. You can implement custom backends by subclassing `WeightTransferEngine` and registering them with the `WeightTransferEngineFactory`.
|
||||
|
||||
## WeightTransferEngine
|
||||
|
||||
The `WeightTransferEngine` is a generic abstract class parameterized by two dataclass types:
|
||||
|
||||
- **`TInitInfo`** (extends `WeightTransferInitInfo`): Backend-specific initialization parameters.
|
||||
- **`TUpdateInfo`** (extends `WeightTransferUpdateInfo`): Backend-specific weight update metadata.
|
||||
|
||||
### Abstract Methods
|
||||
|
||||
Subclasses must implement these four methods:
|
||||
|
||||
| Method | Side | Description |
|
||||
| ------ | ---- | ----------- |
|
||||
| `init_transfer_engine(init_info)` | Inference | Initialize the communication channel on each inference worker |
|
||||
| `receive_weights(update_info, load_weights)` | Inference | Receive weights and call `load_weights` incrementally |
|
||||
| `shutdown()` | Inference | Clean up resources |
|
||||
| `trainer_send_weights(iterator, trainer_args)` | Trainer | Static method to send weights from the trainer process |
|
||||
|
||||
### Request Classes
|
||||
|
||||
The API-level request classes provide backend-agnostic serialization using plain dictionaries. The engine's `parse_init_info` and `parse_update_info` methods convert these dictionaries into typed dataclasses.
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.base import (
|
||||
WeightTransferInitRequest,
|
||||
WeightTransferUpdateRequest,
|
||||
)
|
||||
|
||||
# Init request (dict is converted to backend-specific TInitInfo)
|
||||
init_request = WeightTransferInitRequest(
|
||||
init_info={"master_address": "10.0.0.1", "master_port": 29500, ...}
|
||||
)
|
||||
|
||||
# Update request (dict is converted to backend-specific TUpdateInfo)
|
||||
update_request = WeightTransferUpdateRequest(
|
||||
update_info={"names": [...], "dtype_names": [...], "shapes": [...]}
|
||||
)
|
||||
```
|
||||
|
||||
### WeightTransferUpdateInfo
|
||||
|
||||
The base `WeightTransferUpdateInfo` includes an `is_checkpoint_format` flag:
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class WeightTransferUpdateInfo(ABC):
|
||||
is_checkpoint_format: bool = True
|
||||
```
|
||||
|
||||
When `is_checkpoint_format=True` (the default), vLLM applies layerwise weight processing (repacking, renaming, etc.) on the received weights before loading them. Set to `False` if the trainer has already converted weights to the kernel format expected by the model.
|
||||
|
||||
## Implementing a Custom Engine
|
||||
|
||||
To create a custom weight transfer backend:
|
||||
|
||||
### 1. Define Info Dataclasses
|
||||
|
||||
```python
|
||||
from dataclasses import dataclass
|
||||
from vllm.distributed.weight_transfer.base import (
|
||||
WeightTransferEngine,
|
||||
WeightTransferInitInfo,
|
||||
WeightTransferUpdateInfo,
|
||||
)
|
||||
|
||||
@dataclass
|
||||
class MyInitInfo(WeightTransferInitInfo):
|
||||
endpoint: str
|
||||
token: str
|
||||
|
||||
@dataclass
|
||||
class MyUpdateInfo(WeightTransferUpdateInfo):
|
||||
names: list[str]
|
||||
dtype_names: list[str]
|
||||
shapes: list[list[int]]
|
||||
# Add custom fields as needed
|
||||
```
|
||||
|
||||
### 2. Implement the Engine
|
||||
|
||||
```python
|
||||
from collections.abc import Callable, Iterator
|
||||
from typing import Any
|
||||
import torch
|
||||
|
||||
class MyWeightTransferEngine(WeightTransferEngine[MyInitInfo, MyUpdateInfo]):
|
||||
init_info_cls = MyInitInfo
|
||||
update_info_cls = MyUpdateInfo
|
||||
|
||||
def init_transfer_engine(self, init_info: MyInitInfo) -> None:
|
||||
# Set up connection to trainer using init_info.endpoint, etc.
|
||||
...
|
||||
|
||||
def receive_weights(
|
||||
self,
|
||||
update_info: MyUpdateInfo,
|
||||
load_weights: Callable[[list[tuple[str, torch.Tensor]]], None],
|
||||
) -> None:
|
||||
# Receive each weight and call load_weights incrementally
|
||||
for name, dtype_name, shape in zip(
|
||||
update_info.names, update_info.dtype_names, update_info.shapes
|
||||
):
|
||||
dtype = getattr(torch, dtype_name)
|
||||
weight = self._fetch_weight(name, shape, dtype)
|
||||
load_weights([(name, weight)])
|
||||
|
||||
def shutdown(self) -> None:
|
||||
# Clean up resources
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
def trainer_send_weights(
|
||||
iterator: Iterator[tuple[str, torch.Tensor]],
|
||||
trainer_args: dict[str, Any],
|
||||
) -> None:
|
||||
# Send weights from the trainer process
|
||||
for name, tensor in iterator:
|
||||
# Send tensor via custom transport
|
||||
...
|
||||
```
|
||||
|
||||
!!! important
|
||||
The `load_weights` callable passed to `receive_weights` should be called **incrementally** (one or a few weights at a time) rather than accumulating all weights first. This avoids GPU out-of-memory errors with large models.
|
||||
|
||||
### 3. Register with the Factory
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.factory import WeightTransferEngineFactory
|
||||
|
||||
# Option 1: Lazy loading (recommended for built-in engines)
|
||||
WeightTransferEngineFactory.register_engine(
|
||||
"my_backend",
|
||||
"my_package.my_module",
|
||||
"MyWeightTransferEngine",
|
||||
)
|
||||
|
||||
# Option 2: Direct class registration
|
||||
WeightTransferEngineFactory.register_engine(
|
||||
"my_backend",
|
||||
MyWeightTransferEngine,
|
||||
)
|
||||
```
|
||||
|
||||
Once registered, users can select your backend via `WeightTransferConfig(backend="my_backend")`.
|
||||
|
||||
## WeightTransferEngineFactory
|
||||
|
||||
The factory uses a registry pattern with lazy loading. Built-in engines (`nccl` and `ipc`) are registered at import time but their modules are only loaded when the backend is actually requested. This avoids importing heavy dependencies (like NCCL communicators) when they aren't needed.
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.factory import WeightTransferEngineFactory
|
||||
|
||||
# Create an engine from config
|
||||
engine = WeightTransferEngineFactory.create_engine(
|
||||
config=weight_transfer_config,
|
||||
parallel_config=parallel_config,
|
||||
)
|
||||
```
|
||||
73
docs/training/weight_transfer/ipc.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# IPC Engine
|
||||
|
||||
The IPC weight transfer engine uses **CUDA IPC** (Inter-Process Communication) handles to share GPU memory directly between the trainer and inference workers on the **same node and same GPU**. This avoids any data copying, making it an efficient option when colocating training and inference.
|
||||
|
||||
## When to Use IPC
|
||||
|
||||
- Training and inference on the **same GPU** (colocated)
|
||||
- You want to minimize memory overhead by sharing tensors in-place
|
||||
|
||||
## How It Works
|
||||
|
||||
1. The trainer creates CUDA tensors for each weight and generates IPC handles using `torch.multiprocessing.reductions.reduce_tensor`.
|
||||
2. IPC handles are sent to the inference engine via **Ray.remote()** or **HTTP POST**.
|
||||
3. The inference worker reconstructs the tensors from the handles, reading directly from the trainer's GPU memory.
|
||||
|
||||
!!! warning
|
||||
IPC handles involve sending serialized Python objects. When using HTTP transport, you must set `VLLM_ALLOW_INSECURE_SERIALIZATION=1` on both the server and client. This is because IPC handles are pickled and base64-encoded for HTTP transmission.
|
||||
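To make the handle mechanism concrete, here is a minimal PyTorch-only sketch of creating and rebuilding a CUDA IPC handle (independent of vLLM; in practice the rebuild step happens in the receiving worker process rather than in the same process):

```python
import torch
from torch.multiprocessing.reductions import reduce_tensor

# Trainer side: a CUDA tensor and a picklable IPC handle for it.
weight = torch.randn(4096, 4096, device="cuda:0", dtype=torch.bfloat16)
rebuild_fn, rebuild_args = reduce_tensor(weight)

# Receiver side (normally another process on the same GPU): rebuild a view
# onto the trainer's GPU memory without copying the data.
shared_view = rebuild_fn(*rebuild_args)
assert shared_view.shape == weight.shape
```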
|
||||
## Initialization
|
||||
|
||||
The IPC backend requires no initialization on either side. The `init_transfer_engine` call is a no-op for IPC.
|
||||
|
||||
## Sending Weights
|
||||
|
||||
IPC supports two transport modes for delivering the handles:
|
||||
|
||||
### Ray Mode
|
||||
|
||||
Used when vLLM is running as a Ray actor:
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.ipc_engine import (
|
||||
IPCTrainerSendWeightsArgs,
|
||||
IPCWeightTransferEngine,
|
||||
)
|
||||
|
||||
trainer_args = IPCTrainerSendWeightsArgs(
|
||||
mode="ray",
|
||||
llm_handle=llm_actor_handle,
|
||||
)
|
||||
|
||||
IPCWeightTransferEngine.trainer_send_weights(
|
||||
iterator=model.named_parameters(),
|
||||
trainer_args=trainer_args,
|
||||
)
|
||||
```
|
||||
|
||||
In Ray mode, the engine calls `llm_handle.update_weights.remote(...)` directly, passing the IPC handles via Ray's serialization.
|
||||
|
||||
### HTTP Mode
|
||||
|
||||
Used when vLLM is running as an HTTP server:
|
||||
|
||||
```python
|
||||
trainer_args = IPCTrainerSendWeightsArgs(
|
||||
mode="http",
|
||||
url="http://localhost:8000",
|
||||
)
|
||||
|
||||
IPCWeightTransferEngine.trainer_send_weights(
|
||||
iterator=model.named_parameters(),
|
||||
trainer_args=trainer_args,
|
||||
)
|
||||
```
|
||||
|
||||
In HTTP mode, IPC handles are pickled, base64-encoded, and sent as JSON to the `/update_weights` endpoint.
|
||||
|
||||
See [`IPCTrainerSendWeightsArgs`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/weight_transfer/ipc_engine.py) for the full list of configurable fields.
|
||||
|
||||
## Examples
|
||||
|
||||
- [RLHF with IPC weight syncing (offline, Ray)](../../examples/rl/rlhf_ipc.md) - Colocated training and inference on a single GPU using Ray placement groups and CUDA IPC handles
|
||||
- [RLHF with IPC weight syncing (online serving, HTTP)](../../examples/rl/rlhf_http_ipc.md) - Weight transfer with a vLLM HTTP server where both server and trainer share the same GPU
|
||||
110
docs/training/weight_transfer/nccl.md
Normal file
@@ -0,0 +1,110 @@
|
||||
# NCCL Engine
|
||||
|
||||
The NCCL weight transfer engine uses [NCCL](https://developer.nvidia.com/nccl) broadcast operations to transfer weights from the trainer to inference workers. It supports **multi-node** and **multi-GPU** setups where the trainer and inference engine run on separate GPUs.
|
||||
|
||||
## When to Use NCCL
|
||||
|
||||
- Training and inference on **separate GPUs** (possibly across nodes)
|
||||
- **Tensor-parallel** inference with multiple workers that all need the updated weights
|
||||
- You need high-bandwidth, low-latency weight transfer over NVLink or InfiniBand
|
||||
|
||||
## How It Works
|
||||
|
||||
1. The trainer and all inference workers join a shared NCCL process group using `StatelessProcessGroup` (vLLM's torch.distributed-independent group abstraction).
|
||||
2. The trainer broadcasts weights to all workers simultaneously. Each worker receives and loads weights incrementally.
|
||||
3. Optionally, **packed tensor broadcasting** batches multiple small tensors into larger buffers with double/triple buffering and CUDA stream overlap for higher throughput. This implementation is based on [NeMo-RL's packed tensor](https://github.com/NVIDIA-NeMo/RL/blob/main/nemo_rl/utils/packed_tensor.py).
|
||||
|
||||
## Initialization
|
||||
|
||||
NCCL requires explicit process group setup. The trainer and inference workers must agree on a master address, port, and world size.
|
||||
|
||||
### Inference Side
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.base import WeightTransferInitRequest
|
||||
|
||||
# rank_offset accounts for the trainer occupying rank 0
|
||||
llm.init_weight_transfer_engine(
|
||||
WeightTransferInitRequest(
|
||||
init_info=dict(
|
||||
master_address=master_address,
|
||||
master_port=master_port,
|
||||
rank_offset=1,
|
||||
world_size=world_size, # trainer + all inference workers
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Trainer Side
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.nccl_engine import (
|
||||
NCCLWeightTransferEngine,
|
||||
)
|
||||
|
||||
group = NCCLWeightTransferEngine.trainer_init(
|
||||
dict(
|
||||
master_address=master_address,
|
||||
master_port=master_port,
|
||||
world_size=world_size,
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
!!! note
|
||||
`trainer_init` always assigns the trainer to rank 0. Inference workers start at `rank_offset` (typically 1).
|
||||
|
||||
## Sending Weights
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.nccl_engine import (
|
||||
NCCLTrainerSendWeightsArgs,
|
||||
NCCLWeightTransferEngine,
|
||||
)
|
||||
|
||||
trainer_args = NCCLTrainerSendWeightsArgs(
|
||||
group=group,
|
||||
packed=True, # use packed broadcasting for efficiency
|
||||
)
|
||||
|
||||
NCCLWeightTransferEngine.trainer_send_weights(
|
||||
iterator=model.named_parameters(),
|
||||
trainer_args=trainer_args,
|
||||
)
|
||||
```
|
||||
|
||||
See [`NCCLTrainerSendWeightsArgs`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/weight_transfer/nccl_engine.py) for the full list of configurable fields.
|
||||
|
||||
### Packed Tensor Broadcasting
|
||||
|
||||
When `packed=True`, multiple weight tensors are packed into large contiguous buffers before broadcasting. This reduces the number of NCCL operations and uses double/triple buffering with dedicated CUDA streams for overlap between packing, broadcasting, and unpacking.
|
||||
|
||||
Both the trainer (`NCCLTrainerSendWeightsArgs`) and inference side (`NCCLWeightTransferUpdateInfo`) must use matching `packed_buffer_size_bytes` and `packed_num_buffers` values.
|
||||
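For instance, a sketch of keeping the two sides in agreement might look like the following; the buffer values are illustrative, `group`, `llm`, and the parameter metadata lists are assumed to be set up as in the surrounding sections, and the keyword fields are assumed to exist on both classes as stated above.

```python
from vllm.distributed.weight_transfer.base import WeightTransferUpdateRequest
from vllm.distributed.weight_transfer.nccl_engine import NCCLTrainerSendWeightsArgs

PACKED_BUFFER_SIZE_BYTES = 256 * 1024 * 1024  # illustrative: 256 MiB buffers
PACKED_NUM_BUFFERS = 2                        # double buffering

# Trainer side
trainer_args = NCCLTrainerSendWeightsArgs(
    group=group,
    packed=True,
    packed_buffer_size_bytes=PACKED_BUFFER_SIZE_BYTES,
    packed_num_buffers=PACKED_NUM_BUFFERS,
)

# Inference side: the update_info must carry the same packing parameters.
llm.update_weights(
    WeightTransferUpdateRequest(
        update_info=dict(
            names=names,
            dtype_names=dtype_names,
            shapes=shapes,
            packed=True,
            packed_buffer_size_bytes=PACKED_BUFFER_SIZE_BYTES,
            packed_num_buffers=PACKED_NUM_BUFFERS,
        )
    )
)
```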
|
||||
## Receiving Weights (Inference Side)
|
||||
|
||||
The inference side triggers weight reception by calling `update_weights`:
|
||||
|
||||
```python
|
||||
from vllm.distributed.weight_transfer.base import WeightTransferUpdateRequest
|
||||
|
||||
llm.update_weights(
|
||||
WeightTransferUpdateRequest(
|
||||
update_info=dict(
|
||||
names=names,
|
||||
dtype_names=dtype_names,
|
||||
shapes=shapes,
|
||||
packed=True,
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
The `names`, `dtype_names`, and `shapes` lists describe each parameter. These must match the order in which the trainer iterates over its parameters.
|
||||
|
||||
## Examples
|
||||
|
||||
- [RLHF with NCCL weight syncing (offline, Ray)](../../examples/rl/rlhf_nccl.md) - Trainer on one GPU, 2x tensor-parallel vLLM engine on two others, with packed NCCL weight broadcast
|
||||
- [RLHF with async weight syncing (offline, Ray)](../../examples/rl/rlhf_async_new_apis.md) - Async generation with mid-flight pause, weight sync, resume, and validation against a fresh model
|
||||
- [RLHF with NCCL weight syncing (online serving, HTTP)](../../examples/rl/rlhf_http_nccl.md) - Weight transfer with a running vLLM HTTP server using HTTP control plane and NCCL data plane
|
||||
@@ -70,6 +70,29 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# CohereASR
|
||||
def run_cohere_asr(question: str, audio_count: int) -> ModelRequestData:
|
||||
assert audio_count == 1, "CohereASR only supports a single audio input per prompt"
|
||||
# TODO (ekagra): add HF ckpt after asr release
|
||||
model_name = "/host/engines/vllm/audio/2b-release"
|
||||
|
||||
prompt = (
|
||||
"<|startofcontext|><|startoftranscript|>"
|
||||
"<|emo:undefined|><|en|><|en|><|pnc|><|noitn|>"
|
||||
"<|notimestamp|><|nodiarize|>"
|
||||
)
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
# MusicFlamingo
|
||||
def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "nvidia/music-flamingo-2601-hf"
|
||||
@@ -508,14 +531,15 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
model_example_map = {
|
||||
"audioflamingo3": run_audioflamingo3,
|
||||
"musicflamingo": run_musicflamingo,
|
||||
"cohere_asr": run_cohere_asr,
|
||||
"funaudiochat": run_funaudiochat,
|
||||
"gemma3n": run_gemma3n,
|
||||
"glmasr": run_glmasr,
|
||||
"funaudiochat": run_funaudiochat,
|
||||
"granite_speech": run_granite_speech,
|
||||
"kimi_audio": run_kimi_audio,
|
||||
"midashenglm": run_midashenglm,
|
||||
"minicpmo": run_minicpmo,
|
||||
"musicflamingo": run_musicflamingo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
"qwen2_audio": run_qwen2_audio,
|
||||
"qwen2_5_omni": run_qwen2_5_omni,
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
|
||||
|
||||
The script separates training and inference workloads onto distinct GPUs
|
||||
so that Ray can manage process placement and inter-process communication.
|
||||
A Hugging Face Transformer model occupies GPU 0 for training, whereas a
|
||||
tensor-parallel vLLM inference engine occupies GPU 1–2.
|
||||
|
||||
The example performs the following steps:
|
||||
|
||||
* Load the training model on GPU 0.
|
||||
* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
|
||||
and Ray placement groups.
|
||||
* Generate text from a list of prompts using the inference engine.
|
||||
* Update the weights of the training model and broadcast the updated weights
|
||||
to the inference engine by using a Ray collective RPC group. Note that
|
||||
for demonstration purposes we simply zero out the weights.
|
||||
|
||||
For a production-ready implementation that supports multiple training and
|
||||
inference replicas, see the OpenRLHF framework:
|
||||
https://github.com/OpenRLHF/OpenRLHF
|
||||
|
||||
This example assumes a single-node cluster with three GPUs, but Ray
|
||||
supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
|
||||
workloads. Residual GPU activity interferes with vLLM memory profiling and
|
||||
causes unexpected behavior.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import ray
|
||||
import torch
|
||||
from ray.util.placement_group import placement_group
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
from rlhf_utils import stateless_init_process_group
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils.network_utils import get_ip, get_open_port
|
||||
|
||||
|
||||
class MyLLM(LLM):
|
||||
"""Configure the vLLM worker for Ray placement group execution."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
|
||||
# so that vLLM can manage its own device placement within the worker.
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
# Load the OPT-125M model onto GPU 0 for the training workload.
|
||||
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
|
||||
train_model.to("cuda:0")
|
||||
|
||||
# Initialize Ray and set the visible devices. The vLLM engine will
|
||||
# be placed on GPUs 1 and 2.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
|
||||
ray.init()
|
||||
|
||||
# Create a placement group that reserves GPU 1–2 for the vLLM inference engine.
|
||||
# Learn more about Ray placement groups:
|
||||
# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html
|
||||
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
|
||||
ray.get(pg_inference.ready())
|
||||
scheduling_inference = PlacementGroupSchedulingStrategy(
|
||||
placement_group=pg_inference,
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=0,
|
||||
)
|
||||
|
||||
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
|
||||
# start-up latency.
|
||||
llm = ray.remote(
|
||||
num_cpus=0,
|
||||
num_gpus=0,
|
||||
scheduling_strategy=scheduling_inference,
|
||||
)(MyLLM).remote(
|
||||
model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
worker_extension_cls="rlhf_utils.WorkerExtension",
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="ray",
|
||||
)
|
||||
|
||||
# Generate text from the prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0)
|
||||
|
||||
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
|
||||
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
# Set up the communication channel between the training process and the
|
||||
# inference engine.
|
||||
master_address = get_ip()
|
||||
master_port = get_open_port()
|
||||
|
||||
handle = llm.collective_rpc.remote(
|
||||
"init_weight_update_group", args=(master_address, master_port, 1, 3)
|
||||
)
|
||||
|
||||
model_update_group = stateless_init_process_group(
|
||||
master_address, master_port, 0, 3, torch.device("cuda:0")
|
||||
)
|
||||
ray.get(handle)
|
||||
|
||||
# Simulate a training step by zeroing out all model weights.
|
||||
# In a real RLHF training loop the weights would be updated using the gradient
|
||||
# from an RL objective such as PPO on a reward model.
|
||||
for name, p in train_model.named_parameters():
|
||||
p.data.zero_()
|
||||
|
||||
# Synchronize the updated weights to the inference engine.
|
||||
for name, p in train_model.named_parameters():
|
||||
dtype_name = str(p.dtype).split(".")[-1]
|
||||
handle = llm.collective_rpc.remote(
|
||||
"update_weight", args=(name, dtype_name, p.shape)
|
||||
)
|
||||
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
|
||||
ray.get(handle)
|
||||
|
||||
# Verify that the inference weights have been updated.
|
||||
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
|
||||
|
||||
# Generate text with the updated model. The output is expected to be nonsense
|
||||
# because the weights are zero.
|
||||
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
|
||||
print("-" * 50)
|
||||
for output in outputs_updated:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
@@ -1,256 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Demonstrates how to co-locate a vLLM inference worker and training
|
||||
actors on the same set of GPUs for reinforcement learning from human feedback
|
||||
(RLHF) workloads.
|
||||
|
||||
Ray serves as the distributed execution framework in this example. Ray
|
||||
placement groups allocate both training actors and vLLM workers to the
|
||||
same GPU bundles, enabling fast, in-GPU communication between the two
|
||||
components.
|
||||
|
||||
The script shows how to do the following:
|
||||
|
||||
* Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
|
||||
`VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
|
||||
devices.
|
||||
* Exchange tensors between processes by means of CUDA inter-process
|
||||
communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
|
||||
when multiple processes share a single GPU.
|
||||
|
||||
Note that this example assumes a single-node cluster with four GPUs, but Ray
|
||||
supports multi-node clusters. vLLM expects exclusive use of the GPUs during
|
||||
its initialization for memory profiling. Residual GPU activity interferes
|
||||
with vLLM memory profiling and causes unexpected behavior.
|
||||
|
||||
Learn more about Ray placement groups:
|
||||
https://docs.ray.io/en/latest/placement-groups.html
|
||||
"""
|
||||

import gc
import os
import sys

import ray
import torch
import zmq
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from torch.multiprocessing.reductions import reduce_tensor

from vllm import LLM

if torch.version.hip is not None:
    print("Skipping test for ROCm. Ray is unsupported on vLLM ROCm.")
    sys.exit(0)


class MyLLM(LLM):
    """Configure the vLLM worker for Ray placement group execution.

    The constructor sets environment variables that allow multiple vLLM
    workers to share a single physical GPU and that encode the bundle
    indices assigned by the placement group.

    Args:
        *args: Positional arguments forwarded to `vllm.LLM`.
        bundle_indices (list[int]): Placement-group bundle indices
            assigned to this worker.
        **kwargs: Keyword arguments forwarded to `vllm.LLM`.
    """

    def __init__(self, *args, bundle_indices: list[int], **kwargs):
        # Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES variable
        # so that vLLM can manage its own device placement inside the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
        print(f"creating LLM with bundle_indices={bundle_indices}")
        super().__init__(*args, **kwargs)


class RayTrainingActor:
    """Training actor that hosts a Facebook OPT-125M model from Hugging Face.

    The model is loaded onto the first GPU assigned to this actor, and the
    actor exposes CUDA IPC handles so that colocated vLLM workers can map
    the tensors directly.
    """

    def __init__(self):
        # Ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this actor.
        from transformers import AutoModelForCausalLM

        self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        self.model.to("cuda:0")
        # Zero out all the parameters.
        for name, p in self.model.named_parameters():
            p.data.zero_()
        torch.accelerator.synchronize()
        # The argument for `get_device_uuid` is the index of the GPU in the
        # list of visible devices.
        from vllm.platforms import current_platform

        self.device_uuid = current_platform.get_device_uuid(0)
        self.zmq_context = zmq.Context()
        self.zmq_address_counter = 0
        self.zmq_handle = None

    def report_device_id(self) -> str:
        return self.device_uuid

    def get_zmq_handles(self) -> dict[str, str]:
        suffix = f"{self.device_uuid}-{self.zmq_address_counter}"
        self.zmq_handle = f"ipc:///tmp/rl-colocate-zmq-{suffix}.sock"
        self.zmq_address_counter += 1
        return {self.device_uuid: self.zmq_handle}

    def update_weights(self):
        # align size to avoid misaligned address
        align_size = 256

        def get_size(p: torch.Tensor) -> int:
            return (p.nbytes + align_size - 1) // align_size * align_size
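
        # Illustrative only (the 1000-byte value is hypothetical, not taken
        # from OPT-125M): a 1000-byte parameter is padded up to 1024 bytes, so
        # every tensor copied into the shared buffer starts on a 256-byte
        # boundary.
        assert (1000 + align_size - 1) // align_size * align_size == 1024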

        named_parameters: dict[str, torch.nn.Parameter] = dict(
            self.model.named_parameters()
        )
        max_tensor_size = max(get_size(p) for p in named_parameters.values())
        # use max_tensor_size * 2 as buffer size
        buffer = torch.empty(max_tensor_size * 2, dtype=torch.uint8, device="cuda:0")
        s = self.zmq_context.socket(zmq.REQ)
        s.bind(self.zmq_handle)
        handle = reduce_tensor(buffer)

        offset = 0
        buckets: list[tuple[list[dict], list[torch.Tensor]]] = []
        named_tensors: list[dict] = []
        real_tensors: list[torch.Tensor] = []
        for name, p in named_parameters.items():
            size = get_size(p)
            if offset + size > buffer.numel():
                buckets.append((named_tensors, real_tensors))
                named_tensors, real_tensors = [], []
                offset = 0
            # assume tensors are contiguous
            named_tensors.append(
                {"name": name, "dtype": p.dtype, "shape": p.shape, "offset": offset}
            )
            real_tensors.append(p)
            offset += size
        if named_tensors:
            buckets.append((named_tensors, real_tensors))
        s.send_pyobj(handle)
        s.recv()
        for named_tensors, real_tensors in buckets:
            offset = 0
            for p in real_tensors:
                buffer[offset : offset + p.nbytes].data.copy_(
                    p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
                )
                offset += get_size(p)
            torch.accelerator.synchronize()
            s.send_pyobj(named_tensors)
            s.recv()
        s.send_pyobj(None)
        s.recv()
        s.close()
        del buffer
        gc.collect()
        torch.accelerator.empty_cache()
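

# How the weight transfer above pairs with
# `ColocateWorkerExtension.update_weights_from_ipc` (defined in rlhf_utils.py):
# the training actor is the ZMQ REQ side and the vLLM worker is the REP side.
# In outline:
#   1. REQ sends the CUDA IPC handle of the shared uint8 buffer; REP rebuilds
#      the buffer on its own device and acks.
#   2. For each bucket, REQ copies the flattened tensors into the buffer and
#      sends their metadata (name/dtype/shape/offset); REP reconstructs views
#      into the buffer, calls load_weights, and acks.
#   3. REQ sends None to signal completion; REP finalizes the weights and acks.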

# Ray manages four GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()

# Co-locate vLLM instances and training actors on the same set of GPUs:
# * GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
#   (tensor parallelism = 2).
# * GPU 2 and 3: training actor 2, training actor 3, and vLLM instance 1
#   (tensor parallelism = 2).
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")

training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []

for bundle_index in [0, 1, 2, 3]:
    training_actor = ray.remote(
        num_cpus=0,
        num_gpus=0.4,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
            placement_group_bundle_index=bundle_index,
        ),
    )(RayTrainingActor).remote()
    training_actors.append(training_actor)

for bundle_index, training_actor in enumerate(training_actors):
    device_id = ray.get(training_actor.report_device_id.remote())
    print(f"training actor {bundle_index} is on {device_id}")
    training_actor_device_ids.append(device_id)

for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
    # Use the following syntax instead of the @ray.remote decorator so that
    # the placement group is customized for each bundle.
    llm = ray.remote(
        num_cpus=0,
        num_gpus=0,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ),
    )(MyLLM).remote(
        model="facebook/opt-125m",
        enforce_eager=True,
        worker_extension_cls="rlhf_utils.ColocateWorkerExtension",
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        gpu_memory_utilization=0.4,
        bundle_indices=bundle_indices,
    )
    inference_engines.append(llm)
    # Do not call any method on the inference engine at this point; the call
    # blocks until the vLLM instance finishes initialization.

for i, llm in enumerate(inference_engines):
    inference_engine_device_ids.append(
        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))
    )
    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")

# Verify placement: the first two training actors share the same GPUs as
# the first inference engine.
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# Verify placement: the last two training actors share the same GPUs as
# the second inference engine.
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]

print("Gather all the ZMQ handles from the training actors.")
zmq_handles = {}
for actor in training_actors:
    zmq_handles.update(ray.get(actor.get_zmq_handles.remote()))

print(f"ZMQ handles: {zmq_handles}")

print("Update the weights of the inference engines.")
ray.get(
    [actor.update_weights.remote() for actor in training_actors]
    + [
        llm.collective_rpc.remote("update_weights_from_ipc", args=(zmq_handles,))
        for llm in inference_engines
    ]
)

print("Check if the weights are updated.")
for llm in inference_engines:
    assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
@@ -1,162 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.

The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformers model occupies GPU 0 for training, whereas a
tensor-parallel vLLM inference engine occupies GPUs 1–2.

The example performs the following steps:

* Load the training model on GPU 0.
* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
  and Ray placement groups.
* Generate text from a list of prompts using the inference engine.
* Update the weights of the training model and broadcast the updated weights
  to the inference engine by using a Ray collective RPC group. Note that
  for demonstration purposes we simply zero out the weights.

For a production-ready implementation that supports multiple training and
inference replicas, see the OpenRLHF framework:
https://github.com/OpenRLHF/OpenRLHF

This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects exclusive use of its GPUs;
residual GPU activity interferes with vLLM memory profiling and causes
unexpected behavior.
"""

import json
import os

import ray
import torch
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from rlhf_utils import stateless_init_process_group
from torchao.core.config import config_to_dict
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
)
from transformers import AutoModelForCausalLM

from vllm import LLM, SamplingParams
from vllm.utils.network_utils import get_ip, get_open_port


class MyLLM(LLM):
    """Configure the vLLM worker for Ray placement group execution."""

    def __init__(self, *args, **kwargs):
        # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
        # so that vLLM can manage its own device placement within the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        super().__init__(*args, **kwargs)


# Load the OPT-125M model onto GPU 0 for the training workload.
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
train_model.to("cuda:0")

# Initialize Ray and set the visible devices. The vLLM engine will
# be placed on GPUs 1 and 2.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
ray.init()

# Create a placement group that reserves GPUs 1–2 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy(
    placement_group=pg_inference,
    placement_group_capture_child_tasks=True,
    placement_group_bundle_index=0,
)

# Launch the vLLM inference engine. The `enforce_eager` flag reduces
# start-up latency.

# Generate a torchao quantization config for the RL rollout.
# See https://github.com/vllm-project/vllm/pull/23014 for instructions on
# using serialized config files instead of passing around a JSON string.
config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())

json_str = json.dumps(config_to_dict(config))

llm = ray.remote(
    num_cpus=0,
    num_gpus=0,
    scheduling_strategy=scheduling_inference,
)(MyLLM).remote(
    model="facebook/opt-125m",
    hf_overrides={"quantization_config_dict_json": json_str},
    enforce_eager=True,
    worker_extension_cls="rlhf_utils.WorkerExtension",
    tensor_parallel_size=2,
    distributed_executor_backend="ray",
)

# Generate text from the prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

sampling_params = SamplingParams(temperature=0)
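# Note: temperature=0 selects greedy decoding, so the generations before and
# after the weight update can be compared deterministically.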

outputs = ray.get(llm.generate.remote(prompts, sampling_params))

print("-" * 50)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-" * 50)

# Set up the communication channel between the training process and the
# inference engine.
master_address = get_ip()
master_port = get_open_port()

handle = llm.collective_rpc.remote(
    "init_weight_update_group", args=(master_address, master_port, 1, 3)
)

model_update_group = stateless_init_process_group(
    master_address, master_port, 0, 3, torch.device("cuda:0")
)
ray.get(handle)
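
# How the ranks line up (values taken from the calls above): the training
# process joins the group as rank 0 (the src of every broadcast), and the two
# tensor-parallel vLLM workers join as ranks 1 and 2 via rank_offset=1, for a
# world_size of 3. `init_weight_update_group` is dispatched asynchronously so
# that both sides enter the rendezvous together; `ray.get(handle)` returns
# only once the group is fully formed.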

# Simulate a training step by zeroing out all model weights.
# In a real RLHF training loop the weights would be updated using the gradient
# from an RL objective such as PPO on a reward model.
for name, p in train_model.named_parameters():
    p.data.zero_()

# Synchronize the updated weights to the inference engine.
for name, p in train_model.named_parameters():
    dtype_name = str(p.dtype).split(".")[-1]
    handle = llm.collective_rpc.remote(
        "update_weight", args=(name, dtype_name, p.shape)
    )
    model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
    ray.get(handle)

# Verify that the inference weights have been updated.
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))

# Generate text with the updated model. The output is expected to be nonsense
# because the weights are zero.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-" * 50)
@@ -1,168 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from collections.abc import Callable
from typing import TypedDict

import torch
import zmq


def stateless_init_process_group(master_address, master_port, rank, world_size, device):
    """
    vLLM provides `StatelessProcessGroup` to create a process group
    without touching the global process group in torch.distributed.
    It is recommended to create a `StatelessProcessGroup` first and then
    initialize the data-plane communication (NCCL) between the external
    (training) processes and the vLLM workers.
    """
    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
    from vllm.distributed.utils import StatelessProcessGroup

    pg = StatelessProcessGroup.create(
        host=master_address, port=master_port, rank=rank, world_size=world_size
    )
    pynccl = PyNcclCommunicator(pg, device=device)
    return pynccl


class WorkerExtension:
    """
    The class for vLLM's worker to inherit from.
    By defining an extension class, the code works regardless of the
    underlying worker class.

    NOTE: we define this class in a separate module, and the main module
    should pass the fully qualified name as the `worker_extension_cls` argument.
    """

    def init_weight_update_group(
        self, master_address, master_port, rank_offset, world_size
    ):
        from vllm.distributed.parallel_state import get_world_group

        rank = get_world_group().rank + rank_offset
        self.model_update_group = stateless_init_process_group(
            master_address,
            master_port,
            rank,
            world_size,
            self.device,
        )

    def update_weight(self, name, dtype_name, shape):
        dtype = getattr(torch, dtype_name)
        weight = torch.empty(shape, dtype=dtype, device="cuda")
        self.model_update_group.broadcast(
            weight, src=0, stream=torch.cuda.current_stream()
        )

        self.model_runner.model.load_weights(weights=[(name, weight)])

        del weight

    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.
        """
        weights_updated = True
        for name, p in self.model_runner.model.named_parameters():
            weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
        return weights_updated


def rebuild_ipc(
    handle: tuple[Callable, tuple], device_id: int | None = None
) -> torch.Tensor:
    func, args = handle
    list_args = list(args)
    if device_id is not None:
        # the key is to change device id to the current device id
        # in case two processes have different CUDA_VISIBLE_DEVICES
        list_args[6] = device_id
    buffer = func(*list_args)
    return buffer
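
# Note on the handle layout (based on how this example uses it): on the
# sending side, `torch.multiprocessing.reductions.reduce_tensor` returns a
# `(rebuild_func, args)` pair, and the function above rewrites the positional
# argument that carries the device index (`args[6]`) so the buffer is
# re-opened on this worker's GPU even when the two processes see different
# CUDA_VISIBLE_DEVICES orderings.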


class FlattenedTensorMetadata(TypedDict):
    name: str
    shape: torch.Size
    dtype: torch.dtype
    # specify the start offset of this tensor in shared ipc_buffer tensor
    offset: int
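
# An illustrative (hypothetical) entry, matching what
# RayTrainingActor.update_weights sends for each parameter:
#   {"name": "model.decoder.embed_tokens.weight",
#    "dtype": torch.float32,
#    "shape": torch.Size([50272, 768]),
#    "offset": 0}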


class ColocateWorkerExtension:
    """
    The class for vLLM's worker to inherit from, in the colocate setting.
    By defining an extension class, the code works regardless of the
    underlying worker class.

    NOTE: we define this class in a separate module, and the main module
    should pass the fully qualified name as the `worker_extension_cls` argument.
    """

    def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
        from vllm.model_executor.model_loader.utils import process_weights_after_loading

        assert self.device is not None
        if not hasattr(self, "_zmq_ctx") or self._zmq_ctx is None:
            self._zmq_ctx = zmq.Context()
        socket = self._zmq_ctx.socket(zmq.REP)
        socket.connect(zmq_handles[self.report_device_id()])
        buffer: torch.Tensor | None = None
        while True:
            payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
                socket.recv_pyobj()
            )
            if payload is None:
                # A `None` payload means the update is done.
                process_weights_after_loading(
                    self.model_runner.model, self.model_config, self.device
                )
                torch.accelerator.synchronize()
                socket.send(b"")
                break
            if isinstance(payload, tuple):
                # An IPC handle: vLLM can use `func, args = handle`
                # and `func(*args)` to rebuild the GPU tensor.
                buffer = rebuild_ipc(payload, self.device.index)
                assert buffer.dtype == torch.uint8
                socket.send(b"")
                continue
            assert isinstance(payload, list)
            assert buffer is not None
            weights = []
            for item in payload:
                shape = item["shape"]
                if isinstance(shape, (list, tuple)):
                    shape = torch.Size(shape)
                assert isinstance(shape, torch.Size)
                dtype, offset = item["dtype"], item["offset"]
                size = dtype.itemsize * shape.numel()
                tensor = buffer[offset : offset + size].view(dtype=dtype).view(shape)
                weights.append((item["name"], tensor))
            self.model_runner.model.load_weights(weights=weights)
            del weights
            torch.accelerator.synchronize()
            socket.send(b"")

        socket.close()
        del buffer
        gc.collect()
        torch.accelerator.empty_cache()

    def report_device_id(self) -> str:
        from vllm.platforms import current_platform

        self.device_uuid = current_platform.get_device_uuid(self.device.index)
        return self.device_uuid

    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.
        """
        weights_updated = True
        for name, p in self.model_runner.model.named_parameters():
            weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
        return weights_updated
Some files were not shown because too many files have changed in this diff.