Compare commits

..

10 Commits

Author SHA1 Message Date
Elvir Crnčević
89138b21cc [Bugfix] Zero-init MLA attention output buffers to prevent NaN from CUDA graph padding (#37442)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit ef2c4f778d)
2026-03-18 18:44:16 -07:00
JartX
6edd43de3c [Bugfix][ROCm] Fix worker startup OOM on ROCm by skipping unreliable cudagraph memory profiling (#36720)
Signed-off-by: JartX <sagformas@epdcenter.es>
(cherry picked from commit e8f9dbc369)
2026-03-18 18:43:52 -07:00
Andreas Karatzas
16c971dbc7 [CI] Fix PaddleOCR-VL HF test failure due to create_causal_mask API rename (#37328)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit eaf7c9b976)
2026-03-18 11:04:33 -07:00
khluu
262ddd0d81 [cherry-pick][Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy #37322
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-18 01:48:32 -07:00
Li, Jiang
e60c1674b3 [Bugfix] Avoid OpenMP thread reallocation in CPU torch compile (#37391)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
(cherry picked from commit 261801242f)
2026-03-18 01:41:42 -07:00
Roy Wang
faa80947f5 [Performance] Add --enable-ep-weight-filter CLI option (#37351)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7a0)
2026-03-18 01:41:25 -07:00
Terry Gao
eeabf740bb [Custom Ops] Add functional + out variant for scaled_fp4_quant (#34389)
Signed-off-by: tianrengao <terrygao87@gmail.com>
(cherry picked from commit 3e6a1e1686)
2026-03-18 01:41:09 -07:00
Elvir Crnčević
cdcffafef8 Fix eplb nvfp4 experts hook (#37217)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Signed-off-by: Elvir Crncevic <elvir@anthropic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
(cherry picked from commit fd4d96302a)
2026-03-18 01:40:57 -07:00
Walter Beller-Morales
4d22667c32 [Feature][Frontend] add support for Cohere Embed v2 API (#37074)
Signed-off-by: walterbm <walter.beller.morales@gmail.com>
(cherry picked from commit 061980c36a)
2026-03-16 22:05:47 -07:00
Andreas Karatzas
1fe3932c8b [ROCm] Fix AttributeError for torch.compiler.skip_all_guards_unsafe on older PyTorch (#37219)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 54a62a79f7)
2026-03-16 21:03:49 -07:00
680 changed files with 18859 additions and 31921 deletions

View File

@@ -1 +0,0 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml

View File

@@ -16,23 +16,6 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT
# ── Detect PyTorch index URL ─────────────────────────────────────────────
if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
TORCH_INDEX_URL="${CANDIDATE_URL}"
else
echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
echo ">>> Falling back to default PyPI (resolution may be incomplete)"
TORCH_INDEX_URL=""
fi
else
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
fi
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(
@@ -133,11 +116,6 @@ echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"
EXTRA_INDEX_ARGS=()
if [[ -n "${TORCH_INDEX_URL}" ]]; then
EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
fi
set +e
uv pip compile \
"${WORK_DIR}/requirements.txt" \
@@ -148,7 +126,7 @@ uv pip compile \
-c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \
"${EXTRA_INDEX_ARGS[@]}" \
--extra-index-url https://download.pytorch.org/whl/cu129 \
--index-strategy unsafe-best-match \
--unsafe-package setuptools \
--unsafe-package ray \

View File

@@ -333,18 +333,15 @@ apply_rocm_test_overrides() {
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/chat_completion/test_audio.py \
--ignore=entrypoints/openai/completion/test_shutdown.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/models/test_models.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/serve"* ]]; then
cmds="${cmds} \
--ignore=entrypoints/serve/lora/test_lora_adapters.py"
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then

View File

@@ -127,7 +127,7 @@ run_and_track_test() {
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \

View File

@@ -33,22 +33,23 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'

View File

@@ -1,14 +1,11 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#
# Environment variables:
# ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
# ROCM_ATTN, FLASHINFER). If unset, uses vllm default.
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
@@ -25,14 +22,6 @@ wait_for_server() {
MODEL="deepseek-ai/DeepSeek-V2-Lite"
# ── Build optional vllm serve flags ─────────────────────────────────────
EXTRA_ARGS=()
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
echo "Using attention backend: ${ATTENTION_BACKEND}"
EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
@@ -51,8 +40,7 @@ vllm serve "$MODEL" \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" \
${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"

File diff suppressed because it is too large Load Diff

View File

@@ -59,7 +59,7 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -s -v tests/compile/passes/distributed
- label: Fusion and Compile Unit Tests (2xB200)
- label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200

View File

@@ -15,29 +15,8 @@ steps:
- pytest -v -s distributed/test_shm_buffer.py
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed DP Tests (2 GPUs)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/v1/distributed
- tests/entrypoints/openai/test_multi_api_servers.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
- label: Distributed Compile + RPC Tests (2 GPUs)
timeout_in_minutes: 20
- label: Distributed (2 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
@@ -50,31 +29,22 @@ steps:
- vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/entrypoints/llm/test_collective_rpc.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- label: Distributed Torchrun + Shutdown Tests (2 GPUs)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/distributed
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
@@ -82,35 +52,41 @@ steps:
- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace"
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/rl/
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 examples/offline_inference/data_parallel.py --enforce-eager
# rlhf examples
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
@@ -193,7 +169,7 @@ steps:
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py

View File

@@ -70,15 +70,3 @@ steps:
device: mi325_4
depends_on:
- image-build-amd
- label: V1 e2e (4xH100)
timeout_in_minutes: 60
device: h100
num_devices: 4
optional: true
source_file_dependencies:
- vllm/v1/attention/backends/utils.py
- vllm/v1/worker/gpu_model_runner.py
- tests/v1/e2e/test_hybrid_chunked_prefill.py
commands:
- pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py

View File

@@ -10,7 +10,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM)
timeout_in_minutes: 40
@@ -34,7 +34,7 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
@@ -48,11 +48,11 @@ steps:
source_file_dependencies:
- vllm/
- tests/entrypoints/rpc
- tests/entrypoints/serve/instrumentator
- tests/entrypoints/instrumentator
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/serve/instrumentator
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
@@ -75,6 +75,19 @@ steps:
commands:
- pytest -v -s entrypoints/openai/responses
- label: Entrypoints V1
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness
timeout_in_minutes: 30
source_file_dependencies:

View File

@@ -24,7 +24,8 @@ steps:
- label: Elastic EP Scaling Test
timeout_in_minutes: 20
device: h100
device: b200
optional: true
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:

View File

@@ -35,7 +35,7 @@ steps:
parallelism: 2
- label: Kernels MoE Test %N
timeout_in_minutes: 25
timeout_in_minutes: 60
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
@@ -47,7 +47,7 @@ steps:
commands:
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 5
parallelism: 2
- label: Kernels Mamba Test
timeout_in_minutes: 45

View File

@@ -45,22 +45,6 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Qwen3.5 Models (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- vllm/model_executor/models/qwen3_5.py
- vllm/model_executor/models/qwen3_5_mtp.py
- vllm/transformers_utils/configs/qwen3_5.py
- vllm/transformers_utils/configs/qwen3_5_moe.py
- vllm/model_executor/models/qwen3_next.py
- vllm/model_executor/models/qwen3_next_mtp.py
- vllm/model_executor/layers/fla/ops/
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200

View File

@@ -8,7 +8,7 @@ steps:
- vllm/lora
- tests/lora
commands:
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemoel_lora.py
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
parallelism: 4
@@ -30,5 +30,4 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py
- pytest -v -s -x lora/test_qwen35_densemoel_lora.py
- pytest -v -s -x lora/test_gptoss_tp.py

View File

@@ -9,9 +9,9 @@ steps:
- vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

View File

@@ -11,7 +11,7 @@ steps:
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/entrypoints/llm/test_struct_output_generate.py
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
@@ -22,7 +22,7 @@ steps:
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45

View File

@@ -62,7 +62,7 @@ steps:
depends_on:
- image-build-amd
- label: Multi-Modal Processor (CPU)
- label: Multi-Modal Processor Test (CPU)
depends_on:
- image-build-cpu
timeout_in_minutes: 60
@@ -95,44 +95,34 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models (Extended Generation 1)
- label: Multi-Modal Models (Extended) 1
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal/generation
- tests/models/multimodal/test_mapping.py
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
- pytest -v -s models/multimodal/test_mapping.py
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended Generation 2)
- label: Multi-Modal Models (Extended) 2
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal/generation
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models (Extended Generation 3)
- label: Multi-Modal Models (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal/generation
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Multi-Modal Models (Extended Pooling)
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal/pooling
commands:
- pytest -v -s models/multimodal/pooling -m 'not core_model'

View File

@@ -36,6 +36,6 @@ steps:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

View File

@@ -35,7 +35,7 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph
timeout_in_minutes: 30

3
.github/CODEOWNERS vendored
View File

@@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/tests/weight_loading @mgoin @youkaichao @yewentao256
@@ -171,7 +171,6 @@ mkdocs.yaml @hmellor
# Pooling models
/examples/pooling @noooop
/docs/models/pooling_models @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop

11
.github/mergify.yml vendored
View File

@@ -260,7 +260,7 @@ pull_request_rules:
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/structured_outputs/structured_outputs.py
- files~=^tests/v1/structured_output/
- files=tests/entrypoints/llm/test_struct_output_generate.py
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@@ -333,10 +333,9 @@ pull_request_rules:
- label != stale
- or:
- files~=^tests/tool_use/
- files~=^tests/tool_parsers/
- files~=^tests/entrypoints/openai/.*tool.*
- files~=^tests/entrypoints/anthropic/.*tool.*
- files~=^vllm/tool_parsers/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
@@ -382,7 +381,7 @@ pull_request_rules:
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/model_executor/model_loader/tensorizer_loader/
actions:
assign:

50
.github/scripts/cleanup_pr_body.sh vendored Executable file
View File

@@ -0,0 +1,50 @@
#!/bin/bash
set -eu
# ensure 1 argument is passed
if [ "$#" -ne 1 ]; then
echo "Usage: $0 <pr_number>"
exit 1
fi
PR_NUMBER=$1
OLD=/tmp/orig_pr_body.txt
NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
sed -i '/<!--.*-->$/d' "${NEW}"
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF
import regex as re
with open("${NEW}", "r") as file:
content = file.read()
pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
content = re.sub(pattern, '', content)
with open("${NEW}", "w") as file:
file.write(content)
EOF
# Run this only if ${NEW} is different than ${OLD}
if ! cmp -s "${OLD}" "${NEW}"; then
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
echo
echo "Updated PR body:"
echo
cat "${NEW}"
else
echo "No changes needed"
fi

32
.github/workflows/cleanup_pr_body.yml vendored Normal file
View File

@@ -0,0 +1,32 @@
name: Cleanup PR Body
on:
pull_request_target:
types: [opened, reopened, edited]
permissions:
pull-requests: write
jobs:
update-description:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install regex
- name: Update PR description
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"

View File

@@ -383,107 +383,4 @@ jobs:
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}
- name: Request missing ROCm info from issue author
if: contains(steps.label-step.outputs.labels_added, 'rocm') && contains(toJSON(github.event.issue.labels.*.name), 'bug')
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const body = (context.payload.issue.body || '').toLowerCase();
// Check for existing bot comments to avoid duplicate requests
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const botAlreadyAsked = comments.data.some(
c => c.user.type === 'Bot' && c.body.includes('<!-- rocm-info-request -->')
);
if (botAlreadyAsked) {
core.notice('ROCm info request already posted, skipping');
return;
}
// Define required information and detection patterns
const requiredInfo = [
{
name: 'Reproducer',
patterns: [
/reproduc/i, /minimal.?example/i, /repro\b/i, /steps to reproduce/i,
/code.?snippet/i, /sample.?code/i,
/```python[\s\S]*?```/, /```bash[\s\S]*?```/, /```sh[\s\S]*?```/,
],
ask: 'A minimal reproducer (code snippet or script that triggers the issue)',
},
{
name: 'Error message',
patterns: [
/error/i, /traceback/i, /exception/i, /fault/i, /crash/i,
/failed/i, /abort/i, /panic/i,
],
ask: 'The full error message or traceback',
},
{
name: 'Installation method',
patterns: [
/docker/i, /rocm\/pytorch/i, /dockerfile/i, /from source/i,
/pip install/i, /build.?from/i, /container/i, /image/i,
/wheel/i, /\.whl/i, /nightly/i,
],
ask: 'How you installed vLLM (Docker image name, pip install, or build from source steps)',
},
{
name: 'Command',
patterns: [
/vllm serve/i, /python\s+\S+\.py/i, /```bash[\s\S]*?```/,
/```sh[\s\S]*?```/, /command/i, /launch/i, /run\s/i,
/--model/i, /--tensor-parallel/i, /--gpu-memory/i,
],
ask: 'The command you used to launch vLLM (e.g., `vllm serve ...` or the Python script)',
},
{
name: 'GFX architecture',
patterns: [
/gfx\d{3,4}/i, /mi\d{3}/i, /mi\d{2}\b/i, /radeon/i,
/gpu.?arch/i, /rocm-smi/i, /rocminfo/i, /navi/i,
/instinct/i,
],
ask: 'Your GPU model and GFX architecture (e.g., MI300X / gfx942) — run `rocminfo | grep gfx`',
},
];
const issueBody = context.payload.issue.body || '';
const missing = requiredInfo.filter(info =>
!info.patterns.some(p => p.test(issueBody))
);
if (missing.length === 0) {
core.notice('All required ROCm info appears to be present');
return;
}
const author = context.payload.issue.user.login;
const checklist = requiredInfo.map(info => {
const found = !missing.includes(info);
return `- [${found ? 'x' : ' '}] ${info.ask}`;
}).join('\n');
const message = [
'<!-- rocm-info-request -->',
`Hi @${author}, thanks for reporting this ROCm issue!`,
'',
'To help us investigate, please make sure the following information is included:',
'',
checklist,
'',
'Please provide any unchecked items above. This will help us reproduce and resolve the issue faster. Thank you!',
].join('\n');
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message,
});
core.notice(`Requested missing ROCm info from @${author}: ${missing.map(m => m.name).join(', ')}`);
}

View File

@@ -1,9 +1,9 @@
name: macOS Apple Silicon Smoke Test
on:
schedule:
# Daily at 2:30 AM UTC
- cron: '30 2 * * *'
push:
branches:
- main
workflow_dispatch: # Manual trigger
permissions:

View File

@@ -1,96 +0,0 @@
name: New PR Bot
on:
pull_request_target:
types: [opened]
permissions:
pull-requests: write
jobs:
update-description:
runs-on: ubuntu-latest
steps:
- name: Update PR description
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const { owner, repo } = context.repo;
const pr_number = context.issue.number;
const { data: pr } = await github.rest.pulls.get({
owner,
repo,
pull_number: pr_number,
});
let body = pr.body || '';
const original = body;
// Remove markdown comments (<!-- ... -->)
body = body.replace(/^<!--.*-->$/gm, '');
// Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ..."
body = body.replace(/^PLEASE FILL IN THE PR DESCRIPTION HERE.*$/gm, '');
// Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ ..."
body = body.replace(/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*[\s\S]*$/, '');
// Remove <details> section containing "PR Checklist (Click to Expand)"
body = body.replace(/(---\n\n)?<details>[\s\S]*?<summary>[\s\S]*?PR Checklist \(Click to Expand\)[\s\S]*?<\/summary>[\s\S]*?<\/details>/g, '');
if (body !== original) {
await github.rest.pulls.update({
owner,
repo,
pull_number: pr_number,
body,
});
console.log('Updated PR body');
} else {
console.log('No changes needed');
}
reminder-comment:
runs-on: ubuntu-latest
steps:
- name: Post welcome comment for first-time contributors
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const { owner, repo } = context.repo;
const prAuthor = context.payload.pull_request.user.login;
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
q: `repo:${owner}/${repo} type:pr author:${prAuthor}`,
per_page: 1,
});
const authorPRCount = searchResults.total_count;
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
if (authorPRCount === 1) {
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
await github.rest.issues.createComment({
owner,
repo,
issue_number: context.issue.number,
body: [
'\u{1f44b} Hi! Thank you for contributing to the vLLM project.',
'',
'\u{1f4ac} Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.',
'',
'Just a reminder: PRs would not trigger full CI run by default.',
'',
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.',
'',
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.',
'',
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.',
'',
'\u{1f680}',
].join('\n'),
});
} else {
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
}

View File

@@ -11,39 +11,9 @@ concurrency:
permissions:
contents: read
pull-requests: read
jobs:
pre-run-check:
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Check PR label and author merge count
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const { data: pr } = await github.rest.pulls.get({
...context.repo,
pull_number: context.payload.pull_request.number,
});
const hasReadyLabel = pr.labels.some(l => l.name === 'ready');
const { data: mergedPRs } = await github.rest.search.issuesAndPullRequests({
q: `repo:${context.repo.owner}/${context.repo.repo} is:pr is:merged author:${pr.user.login}`,
per_page: 4,
});
const mergedCount = mergedPRs.total_count;
if (hasReadyLabel || mergedCount >= 4) {
core.info(`Check passed: ready label=${hasReadyLabel}, 4+ merged PRs=${mergedCount >= 4}`);
} else {
core.setFailed(`PR must have the 'ready' label or the author must have at least 4 merged PRs (found ${mergedCount}).`);
}
pre-commit:
needs: pre-run-check
if: always() && (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

54
.github/workflows/reminder_comment.yml vendored Normal file
View File

@@ -0,0 +1,54 @@
name: PR Reminder Comment Bot
permissions:
pull-requests: write
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
try {
// Get the PR author
const prAuthor = context.payload.pull_request.user.login;
// Check if this is the author's first PR in this repository
// Use GitHub's search API to find all PRs by this author
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
per_page: 100
});
const authorPRCount = searchResults.total_count;
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
// Only post comment if this is the first PR (only one PR by this author)
if (authorPRCount === 1) {
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
'🚀'
});
} else {
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
}
} catch (error) {
console.error('Error checking PR history or posting comment:', error);
// Don't fail the workflow, just log the error
}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -340,6 +340,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
@@ -985,48 +986,6 @@ define_extension_target(
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
# add OR VLLM_GPU_LANG STREQUAL "HIP" here once
# https://github.com/vllm-project/vllm/issues/35163 is resolved
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# _C_stable_libtorch extension (ops registered via STABLE_TORCH_LIBRARY)
#
set(VLLM_STABLE_EXT_SRC
"csrc/libtorch_stable/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_STABLE_EXT_SRC "csrc/libtorch_stable/permute_cols.cu")
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${VLLM_STABLE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
endif()
message(STATUS "Enabling C_stable extension.")
define_extension_target(
_C_stable_libtorch
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_STABLE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
# Set TORCH_TARGET_VERSION for stable ABI compatibility.
# This ensures we only use C-shim APIs available in PyTorch 2.10.
# _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
# which is currently set to 2.10.
target_compile_definitions(_C_stable_libtorch PRIVATE
TORCH_TARGET_VERSION=0x020A000000000000ULL)
# Needed to use cuda APIs from C-shim
target_compile_definitions(_C_stable_libtorch PRIVATE
USE_CUDA)
endif()
#
# _moe_C extension
#
@@ -1040,7 +999,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu"
"csrc/moe/gpt_oss_router_gemm.cu"
"csrc/moe/router_gemm.cu")
endif()

View File

@@ -47,8 +47,6 @@ from common import (
is_mla_backend,
)
from vllm.v1.worker.workspace import init_workspace_manager
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -464,7 +462,7 @@ def main():
parser.add_argument(
"--batch-specs",
nargs="+",
default=None,
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
@@ -480,21 +478,6 @@ def main():
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
parser.add_argument(
"--kv-cache-dtype",
default="auto",
choices=["auto", "fp8"],
help="KV cache dtype: auto or fp8",
)
parser.add_argument(
"--cuda-graphs",
action=argparse.BooleanOptionalAction,
default=True,
help=(
"Launch kernels with CUDA graphs to eliminate CPU overhead"
"in measurements (default: True)"
),
)
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
@@ -553,24 +536,21 @@ def main():
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
# CLI --batch-specs takes precedence over YAML when provided.
cli_batch_specs_provided = args.batch_specs is not None
if not cli_batch_specs_provided:
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
@@ -595,10 +575,6 @@ def main():
args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
if "kv_cache_dtype" in yaml_config:
args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
if "cuda_graphs" in yaml_config:
args.cuda_graphs = yaml_config["cuda_graphs"]
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
@@ -653,18 +629,12 @@ def main():
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
prefill_backends = getattr(args, "prefill_backends", None)
if not args.batch_specs:
args.batch_specs = ["q2k", "8q1s1k"]
console.print(f"Backends: {', '.join(backends)}")
if prefill_backends:
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print(f"KV cache dtype: {args.kv_cache_dtype}")
console.print(f"CUDA graphs: {args.cuda_graphs}")
console.print()
init_workspace_manager(args.device)
# Run benchmarks
all_results = []
@@ -717,8 +687,6 @@ def main():
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
kv_cache_dtype=args.kv_cache_dtype,
use_cuda_graphs=args.cuda_graphs,
)
# Add decode pipeline config
@@ -871,8 +839,6 @@ def main():
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
"kv_cache_dtype": args.kv_cache_dtype,
"use_cuda_graphs": args.cuda_graphs,
}
all_results = run_model_parameter_sweep(
backends,
@@ -895,8 +861,6 @@ def main():
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
"kv_cache_dtype": args.kv_cache_dtype,
"use_cuda_graphs": args.cuda_graphs,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -927,8 +891,6 @@ def main():
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
kv_cache_dtype=args.kv_cache_dtype,
use_cuda_graphs=args.cuda_graphs,
)
result = run_benchmark(config)

View File

@@ -213,9 +213,6 @@ class BenchmarkConfig:
profile_memory: bool = False
use_cuda_graphs: bool = False
# "auto" or "fp8"
kv_cache_dtype: str = "auto"
# MLA-specific
prefill_backend: str | None = None
kv_lora_rank: int | None = None
@@ -372,7 +369,6 @@ class ResultsFormatter:
"backend",
"batch_spec",
"num_layers",
"kv_cache_dtype",
"mean_time",
"std_time",
"throughput",
@@ -386,7 +382,6 @@ class ResultsFormatter:
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"kv_cache_dtype": r.config.kv_cache_dtype,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,

View File

@@ -30,9 +30,9 @@ batch_specs:
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1ks2k_16q1s1k" # 2 extend + 16 decode
- "4q2ks4k_32q1s2k" # 4 extend + 32 decode
- "2q1ks8k_32q1s2k" # 2 large extend + 32 decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint

View File

@@ -1,58 +0,0 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true

View File

@@ -60,11 +60,9 @@ def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
max_num_batched_tokens: int = 8192,
mla_dims: dict | None = None,
index_topk: int | None = None,
prefill_backend: str | None = None,
kv_cache_dtype: str = "auto",
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
@@ -151,13 +149,13 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
cache_dtype=kv_cache_dtype,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
@@ -537,7 +535,6 @@ def _create_backend_impl(
device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
kv_cache_dtype: str = "auto",
):
"""
Create backend implementation instance.
@@ -586,7 +583,7 @@ def _create_backend_impl(
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": kv_cache_dtype,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
@@ -704,7 +701,6 @@ def _run_single_benchmark(
mla_dims: dict,
device: torch.device,
indexer=None,
kv_cache_dtype: str | None = None,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
@@ -738,124 +734,49 @@ def _run_single_benchmark(
)
# Create KV cache
if kv_cache_dtype is None:
kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
if kv_cache_dtype == "fp8_ds_mla":
# FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
# Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
# + 2*rope_dim bf16 bytes
# = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
kv_cache = torch.zeros(
num_blocks,
block_size,
656,
device=device,
dtype=torch.uint8,
)
elif kv_cache_dtype == "fp8":
from vllm.platforms import current_platform
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
kv_cache = torch.zeros(
num_blocks,
block_size,
head_size,
device=device,
dtype=torch.uint8,
).view(current_platform.fp8_dtype())
else:
kv_cache = torch.zeros(
num_blocks,
block_size,
head_size,
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward methods to use based on metadata.
# Sparse MLA backends always use forward_mqa
has_decode = is_sparse or getattr(metadata, "decode", None) is not None
has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
if not has_decode and not has_prefill:
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.prefill is not None:
forward_fn = lambda: impl.forward_mha(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
num_decode = (
metadata.num_decode_tokens
if (has_decode and has_prefill)
else total_q
if has_decode
else 0
)
num_prefill = total_q - num_decode
# Some backends requires fp8 queries when using fp8 KV cache.
is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
quantize_query = is_fp8_kvcache and getattr(
impl, "supports_quant_query_input", False
)
# quantize_query forces concat format
query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
# Create decode query tensors
if has_decode:
decode_inputs, _ = _create_input_tensors(
num_decode, mla_dims, query_fmt, device, torch.bfloat16
)
# Cast decode query to fp8 if the backend supports it
if quantize_query:
from vllm.platforms import current_platform
if isinstance(decode_inputs, tuple):
decode_inputs = torch.cat(list(decode_inputs), dim=-1)
decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
# Create prefill input tensors
if has_prefill:
_, prefill_inputs = _create_input_tensors(
num_prefill, mla_dims, query_fmt, device, torch.bfloat16
)
# Build forward function
def forward_fn():
results = []
if has_decode:
results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
if has_prefill:
results.append(
impl.forward_mha(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
)
return results[0] if len(results) == 1 else tuple(results)
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.accelerator.synchronize()
# Optionally capture a CUDA graph after warmup.
# Graph replay eliminates CPU launch overhead so timings reflect pure
# kernel time.
if config.use_cuda_graphs:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
forward_fn()
benchmark_fn = graph.replay
else:
benchmark_fn = forward_fn
# Benchmark
times = []
for _ in range(config.repeats):
@@ -864,7 +785,7 @@ def _run_single_benchmark(
start.record()
for _ in range(config.num_layers):
benchmark_fn()
forward_fn()
end.record()
torch.accelerator.synchronize()
@@ -931,30 +852,13 @@ def _run_mla_benchmark_batched(
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Extract kv_cache_dtype from the first config
kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
# FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
# Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
kv_cache_dtype = "fp8_ds_mla"
# Compute max total_q across all configs so the metadata builder buffer
# and scheduler config are large enough for all batch specs.
max_total_q = max(
sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
for cfg, *_ in configs_with_params
)
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
max_num_batched_tokens=max_total_q,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
prefill_backend=prefill_backend,
kv_cache_dtype=kv_cache_dtype,
)
results = []
@@ -979,9 +883,7 @@ def _run_mla_benchmark_batched(
mla_dims,
vllm_config,
device,
max_num_tokens=max_total_q,
index_topk=index_topk if is_sparse else None,
kv_cache_dtype=kv_cache_dtype,
)
# Verify the actual prefill backend matches what was requested
@@ -1040,7 +942,6 @@ def _run_mla_benchmark_batched(
mla_dims,
device,
indexer=indexer,
kv_cache_dtype=kv_cache_dtype,
)
results.append(result)

View File

@@ -140,7 +140,7 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype=config.kv_cache_dtype,
cache_dtype="auto",
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
@@ -215,7 +215,7 @@ def _create_backend_impl(
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype=config.kv_cache_dtype,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
@@ -288,22 +288,12 @@ def _create_input_tensors(
total_q: int,
device: torch.device,
dtype: torch.dtype,
quantize_query: bool = False,
) -> tuple:
"""Create Q, K, V input tensors for all layers.
When quantize_query is True, queries are cast to fp8 to match backends
that require query/key/value dtype consistency.
"""
q_dtype = dtype
if quantize_query:
from vllm.platforms import current_platform
q_dtype = current_platform.fp8_dtype()
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
).to(q_dtype)
)
for _ in range(config.num_layers)
]
k_list = [
@@ -354,17 +344,10 @@ def _create_kv_cache(
# Compute inverse permutation to get back to logical view
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
# Use fp8 dtype for cache when requested.
cache_dtype = dtype
if config.kv_cache_dtype == "fp8":
from vllm.platforms import current_platform
cache_dtype = current_platform.fp8_dtype()
cache_list = []
for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
@@ -409,37 +392,6 @@ def _run_single_benchmark(
)
torch.accelerator.synchronize()
# Optionally capture a CUDA graph after warmup.
# Graph replay eliminates CPU launch overhead so timings reflect pure
# kernel time.
if config.use_cuda_graphs:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
benchmark_fn = graph.replay
else:
def benchmark_fn():
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
# Benchmark
times = []
for _ in range(config.repeats):
@@ -447,7 +399,16 @@ def _run_single_benchmark(
end = torch.cuda.Event(enable_timing=True)
start.record()
benchmark_fn()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.accelerator.synchronize()
@@ -541,12 +502,8 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
common_attn_metadata=common_metadata,
)
# Only quantize queries when the impl supports it
quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
impl, "supports_quant_query_input", False
)
q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype, quantize_query=quantize_query
config, total_q, device, dtype
)
cache_list = _create_kv_cache(

View File

@@ -40,9 +40,9 @@ LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
details.
"""
import dataclasses
import random
import time
from dataclasses import fields
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
@@ -124,7 +124,7 @@ def main(args):
# Create the LLM engine
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")

View File

@@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from dataclasses import fields
from transformers import PreTrainedTokenizerBase
@@ -197,7 +196,7 @@ def main(args):
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(
temperature=0,

View File

@@ -3,10 +3,10 @@
"""Benchmark offline prioritization."""
import argparse
import dataclasses
import json
import random
import time
from dataclasses import fields
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -79,7 +79,7 @@ def run_vllm(
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])

View File

@@ -750,20 +750,17 @@ def get_weight_block_size_safety(config, default_value=None):
def get_model_params(config):
architectures = getattr(config, "architectures", None) or [type(config).__name__]
architecture = architectures[0]
if architecture == "DbrxForCausalLM":
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif architecture == "JambaForCausalLM":
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif architecture in (
elif config.architectures[0] in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
@@ -777,7 +774,7 @@ def get_model_params(config):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif architecture in (
elif config.architectures[0] in (
"Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM",
"Qwen3NextForCausalLM",
@@ -786,27 +783,23 @@ def get_model_params(config):
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif architecture in (
"Qwen3VLMoeForConditionalGeneration",
"Qwen3_5MoeForConditionalGeneration",
"Qwen3_5MoeTextConfig",
):
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif architecture == "HunYuanMoEV1ForCausalLM":
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
elif architecture == "Qwen3OmniMoeForConditionalGeneration":
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
E = config.thinker_config.text_config.num_experts
topk = config.thinker_config.text_config.num_experts_per_tok
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
hidden_size = config.thinker_config.text_config.hidden_size
elif architecture == "PixtralForConditionalGeneration":
elif config.architectures[0] == "PixtralForConditionalGeneration":
# Pixtral can contain different LLM architectures,
# recurse to get their parameters
return get_model_params(config.get_text_config())
@@ -821,23 +814,6 @@ def get_model_params(config):
return E, topk, intermediate_size, hidden_size
def resolve_dtype(config) -> torch.dtype:
if current_platform.is_rocm():
return torch.float16
dtype = getattr(config, "dtype", None)
if dtype is not None:
return dtype
if hasattr(config, "get_text_config"):
text_config = config.get_text_config()
dtype = getattr(text_config, "dtype", None)
if dtype is not None:
return dtype
return torch.bfloat16
def get_quantization_group_size(config) -> int | None:
"""Extract the quantization group size from the HF model config.
@@ -885,7 +861,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = resolve_dtype(config)
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"

View File

@@ -1,134 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.nn.functional as F
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Dimensions supported by the DSV3 specialized kernel
DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
DSV3_SUPPORTED_HIDDEN_SIZES = [7168]
# Dimensions supported by the gpt-oss specialized kernel
GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]
def get_batch_size_range(max_batch_size):
return [2**x for x in range(14) if 2**x <= max_batch_size]
def get_model_params(config):
if config.architectures[0] in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
):
num_experts = config.n_routed_experts
hidden_size = config.hidden_size
elif config.architectures[0] in ("GptOssForCausalLM",):
num_experts = config.num_local_experts
hidden_size = config.hidden_size
else:
raise ValueError(f"Unsupported architecture: {config.architectures}")
return num_experts, hidden_size
def get_benchmark(model, max_batch_size, trust_remote_code):
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=get_batch_size_range(max_batch_size),
x_log=False,
line_arg="provider",
line_vals=[
"torch",
"vllm",
],
line_names=["PyTorch", "vLLM"],
styles=([("blue", "-"), ("red", "-")]),
ylabel="TFLOPs",
plot_name=f"{model} router gemm throughput",
args={},
)
)
def benchmark(batch_size, provider):
config = get_config(model=model, trust_remote_code=trust_remote_code)
num_experts, hidden_size = get_model_params(config)
mat_a = torch.randn(
(batch_size, hidden_size), dtype=torch.bfloat16, device="cuda"
).contiguous()
mat_b = torch.randn(
(num_experts, hidden_size), dtype=torch.bfloat16, device="cuda"
).contiguous()
bias = torch.randn(
num_experts, dtype=torch.bfloat16, device="cuda"
).contiguous()
is_hopper_or_blackwell = current_platform.is_device_capability(
90
) or current_platform.is_device_capability_family(100)
allow_dsv3_router_gemm = (
is_hopper_or_blackwell
and num_experts in DSV3_SUPPORTED_NUM_EXPERTS
and hidden_size in DSV3_SUPPORTED_HIDDEN_SIZES
)
allow_gpt_oss_router_gemm = (
is_hopper_or_blackwell
and num_experts in GPT_OSS_SUPPORTED_NUM_EXPERTS
and hidden_size in GPT_OSS_SUPPORTED_HIDDEN_SIZES
)
has_bias = False
if allow_gpt_oss_router_gemm:
has_bias = True
quantiles = [0.5, 0.2, 0.8]
if provider == "torch":
def runner():
if has_bias:
F.linear(mat_a, mat_b, bias)
else:
F.linear(mat_a, mat_b)
elif provider == "vllm":
def runner():
if allow_dsv3_router_gemm:
ops.dsv3_router_gemm(mat_a, mat_b, torch.bfloat16)
elif allow_gpt_oss_router_gemm:
ops.gpt_oss_router_gemm(mat_a, mat_b, bias)
else:
raise ValueError("Unsupported router gemm")
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
runner, quantiles=quantiles
)
def tflops(t_ms):
flops = 2 * batch_size * hidden_size * num_experts
return flops / (t_ms * 1e-3) / 1e12
return tflops(ms), tflops(max_ms), tflops(min_ms)
return benchmark
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--model", type=str, default="openai/gpt-oss-20b")
parser.add_argument("--max-batch-size", default=16, type=int)
parser.add_argument("--trust-remote-code", action="store_true")
args = parser.parse_args()
# Get the benchmark function
benchmark = get_benchmark(args.model, args.max_batch_size, args.trust_remote_code)
# Run performance benchmark
benchmark.run(print_data=True)

View File

@@ -27,7 +27,7 @@ def get_attn_isa(
else:
if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
return "neon"
elif torch.cpu._is_amx_tile_supported():
elif torch._C._cpu._is_amx_tile_supported():
return "amx"
else:
return "vec"

View File

@@ -24,7 +24,7 @@ except (ImportError, AttributeError) as e:
sys.exit(1)
# ISA selection following test_cpu_fused_moe.py pattern
ISA_CHOICES = ["amx", "vec"] if torch.cpu._is_amx_tile_supported() else ["vec"]
ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
@torch.inference_mode()

View File

@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 29210221863736a08f71a866459e368ad1ac4a95
GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -173,13 +173,10 @@ ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
void ScratchPadManager::realloc(size_t new_size) {
new_size = round(new_size);
if (new_size > size_) {
void* new_ptr = std::aligned_alloc(64, new_size);
TORCH_CHECK(new_ptr != nullptr,
"ScratchPadManager: aligned_alloc failed for size ", new_size);
if (ptr_ != nullptr) {
std::free(ptr_);
}
ptr_ = new_ptr;
ptr_ = std::aligned_alloc(64, new_size);
size_ = new_size;
}
}

View File

@@ -1,9 +0,0 @@
#pragma once
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#ifndef USE_ROCM
torch::stable::Tensor permute_cols(torch::stable::Tensor const& A,
torch::stable::Tensor const& perm);
#endif

View File

@@ -1,21 +0,0 @@
#include "ops.h"
#include "core/registration.h"
#include <torch/csrc/stable/library.h>
// Register ops with STABLE_TORCH_LIBRARY for libtorch stable ABI compatibility.
// Note: We register under namespace "_C" so ops are accessible as
// torch.ops._C.<op_name> for compatibility with existing code.
STABLE_TORCH_LIBRARY_FRAGMENT(_C, m) {
#ifndef USE_ROCM
m.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
#endif
}
STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, m) {
#ifndef USE_ROCM
m.impl("permute_cols", TORCH_BOX(&permute_cols));
#endif
}
REGISTER_EXTENSION(_C_stable_libtorch)

View File

@@ -1,13 +0,0 @@
#pragma once
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <cuda_runtime.h>
// Utility to get the current CUDA stream for a given device using stable APIs.
// Returns a cudaStream_t for use in kernel launches.
inline cudaStream_t get_current_cuda_stream(int32_t device_index) {
void* stream_ptr = nullptr;
TORCH_ERROR_CODE_CHECK(
aoti_torch_get_current_cuda_stream(device_index, &stream_ptr));
return reinterpret_cast<cudaStream_t>(stream_ptr);
}

View File

@@ -1,144 +0,0 @@
/*
* Adapted from
* https://github.com/NVIDIA/TensorRT-LLM/blob/v1.3.0rc7/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu
* Copyright (c) 2025, The vLLM team.
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/all.h>
#include "gpt_oss_router_gemm.cuh"
void launch_gpt_oss_router_gemm(__nv_bfloat16* gA, __nv_bfloat16* gB,
__nv_bfloat16* gC, __nv_bfloat16* bias,
int batch_size, int output_features,
int input_features, cudaStream_t stream) {
static int const WARP_TILE_M = 16;
static int const TILE_M = WARP_TILE_M;
static int const TILE_N = 8;
static int const TILE_K = 64;
static int const STAGES = 16;
static int const STAGE_UNROLL = 4;
static bool const PROFILE = false;
CUtensorMap weight_map{};
CUtensorMap activation_map{};
constexpr uint32_t rank = 2;
uint64_t size[rank] = {(uint64_t)input_features, (uint64_t)output_features};
uint64_t stride[rank - 1] = {input_features * sizeof(__nv_bfloat16)};
uint32_t box_size[rank] = {TILE_K, TILE_M};
uint32_t elem_stride[rank] = {1, 1};
CUresult res = cuTensorMapEncodeTiled(
&weight_map, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, rank,
gB, size, stride, box_size, elem_stride,
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
TORCH_CHECK(res == CUDA_SUCCESS,
"cuTensorMapEncodeTiled failed for weight_map, error code=",
static_cast<int>(res));
size[1] = batch_size;
box_size[1] = TILE_N;
res = cuTensorMapEncodeTiled(
&activation_map, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
rank, gA, size, stride, box_size, elem_stride,
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
TORCH_CHECK(res == CUDA_SUCCESS,
"cuTensorMapEncodeTiled failed for activation_map, error code=",
static_cast<int>(res));
int smem_size = STAGES * STAGE_UNROLL *
(TILE_M * TILE_K * sizeof(__nv_bfloat16) +
TILE_N * TILE_K * sizeof(__nv_bfloat16));
gpuErrChk(cudaFuncSetAttribute(
gpt_oss_router_gemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES,
STAGE_UNROLL, PROFILE>,
cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
int tiles_m = (output_features + TILE_M - 1) / TILE_M;
int tiles_n = (batch_size + TILE_N - 1) / TILE_N;
dim3 grid(tiles_m, tiles_n);
dim3 block(384);
cudaLaunchConfig_t config;
cudaLaunchAttribute attrs[1];
config.gridDim = grid;
config.blockDim = block;
config.dynamicSmemBytes = smem_size;
config.stream = stream;
config.attrs = attrs;
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = 1;
config.numAttrs = 1;
cudaLaunchKernelEx(
&config,
&gpt_oss_router_gemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES,
STAGE_UNROLL, PROFILE>,
gC, gA, gB, bias, output_features, batch_size, input_features, weight_map,
activation_map, nullptr);
}
void gpt_oss_router_gemm_cuda_forward(torch::Tensor& output,
torch::Tensor input, torch::Tensor weight,
torch::Tensor bias) {
auto const batch_size = input.size(0);
auto const input_dim = input.size(1);
auto const output_dim = weight.size(0);
auto stream = at::cuda::getCurrentCUDAStream();
if (input.scalar_type() == at::ScalarType::BFloat16) {
launch_gpt_oss_router_gemm((__nv_bfloat16*)input.data_ptr(),
(__nv_bfloat16*)weight.data_ptr(),
(__nv_bfloat16*)output.mutable_data_ptr(),
(__nv_bfloat16*)bias.data_ptr(), batch_size,
output_dim, input_dim, stream);
} else {
throw std::invalid_argument("Unsupported dtype, only supports bfloat16");
}
}
void gpt_oss_router_gemm(torch::Tensor& output, torch::Tensor input,
torch::Tensor weight, torch::Tensor bias) {
TORCH_CHECK(input.dim() == 2, "input must be 2D");
TORCH_CHECK(weight.dim() == 2, "weight must be 2D");
TORCH_CHECK(bias.dim() == 1, "bias must be 1D");
TORCH_CHECK(input.sizes()[1] == weight.sizes()[1],
"input.size(1) must match weight.size(1)");
TORCH_CHECK(weight.sizes()[0] == bias.sizes()[0],
"weight.size(0) must match bias.size(0)");
TORCH_CHECK(input.scalar_type() == at::ScalarType::BFloat16,
"input tensor must be bfloat16");
TORCH_CHECK(weight.scalar_type() == at::ScalarType::BFloat16,
"weight tensor must be bfloat16");
TORCH_CHECK(bias.scalar_type() == at::ScalarType::BFloat16,
"bias tensor must be bfloat16");
gpt_oss_router_gemm_cuda_forward(output, input, weight, bias);
}

View File

@@ -1,447 +0,0 @@
/*
* Adapted from
* https://github.com/NVIDIA/TensorRT-LLM/blob/v1.3.0rc7/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
* Copyright (c) 2025, The vLLM team.
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_bf16.h"
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "cuda_pipeline.h"
#include <cuda.h>
#include <cuda/barrier>
#include <cuda/std/utility>
#include <cuda_runtime.h>
using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;
namespace ptx = cuda::ptx;
#define gpuErrChk(ans) \
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
inline void gpuAssert(cudaError_t code, char const* file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) {
throw std::runtime_error(cudaGetErrorString(code));
}
}
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
__device__ uint64_t gclock64() {
unsigned long long int rv;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(rv));
return rv;
}
__device__ void ldmatrix(__nv_bfloat16 rv[2], uint32_t smem_ptr) {
int dst;
asm volatile("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n"
: "=r"(dst)
: "r"(smem_ptr));
int* rvi = reinterpret_cast<int*>(&rv[0]);
rvi[0] = dst;
}
__device__ void ldmatrix2(__nv_bfloat16 rv[4], uint32_t smem_ptr) {
int x, y;
asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
: "=r"(x), "=r"(y)
: "r"(smem_ptr));
int* rvi = reinterpret_cast<int*>(&rv[0]);
rvi[0] = x;
rvi[1] = y;
}
__device__ void ldmatrix4(__nv_bfloat16 rv[8], uint32_t smem_ptr) {
int x, y, z, w;
asm volatile(
"ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];"
: "=r"(x), "=r"(y), "=r"(z), "=r"(w)
: "r"(smem_ptr));
int* rvi = reinterpret_cast<int*>(&rv[0]);
rvi[0] = x;
rvi[1] = y;
rvi[2] = z;
rvi[3] = w;
}
__device__ void HMMA_1688(float d[4], __nv_bfloat16 a[4], __nv_bfloat16 b[2],
float c[4]) {
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a[0]);
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b[0]);
float const* C = reinterpret_cast<float const*>(&c[0]);
float* D = reinterpret_cast<float*>(&d[0]);
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]),
"f"(C[3]));
}
__device__ void HMMA_16816(float d[4], __nv_bfloat16 a[8], __nv_bfloat16 b[4],
float c[4]) {
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a[0]);
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b[0]);
float const* C = reinterpret_cast<float const*>(&c[0]);
float* D = reinterpret_cast<float*>(&d[0]);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
}
__device__ void bar_wait(uint32_t bar_ptr, int phase) {
asm volatile(
"{\n"
".reg .pred P1;\n"
"LAB_WAIT:\n"
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
"@P1 bra.uni DONE;\n"
"bra.uni LAB_WAIT;\n"
"DONE:\n"
"}\n" ::"r"(bar_ptr),
"r"(phase));
}
__device__ bool bar_try_wait(uint32_t bar_ptr, int phase) {
uint32_t success;
#ifdef INTERNAL
asm volatile(".pragma \"set knob DontInsertYield\";\n" : : : "memory");
#endif
asm volatile(
"{\n\t"
".reg .pred P1; \n\t"
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
"selp.b32 %0, 1, 0, P1; \n\t"
"}"
: "=r"(success)
: "r"(bar_ptr), "r"(phase));
return success;
}
__device__ uint32_t elect_one_sync() {
uint32_t pred = 0;
uint32_t laneid = 0;
asm volatile(
"{\n"
".reg .b32 %%rx;\n"
".reg .pred %%px;\n"
" elect.sync %%rx|%%px, %2;\n"
"@%%px mov.s32 %1, 1;\n"
" mov.s32 %0, %%rx;\n"
"}\n"
: "+r"(laneid), "+r"(pred)
: "r"(0xFFFFFFFF));
return pred;
}
#endif
struct Profile {
uint64_t start;
uint64_t weight_load_start;
uint64_t act_load_start;
uint64_t compute_start;
uint64_t complete;
};
template <int WARP_TILE_M, int TILE_M, int TILE_N, int TILE_K, int STAGES,
int STAGE_UNROLL, bool PROFILE>
__global__ __launch_bounds__(384, 1) void gpt_oss_router_gemm_kernel(
__nv_bfloat16* output, __nv_bfloat16* weights, __nv_bfloat16* activations,
__nv_bfloat16* bias, int M, int N, int K,
const __grid_constant__ CUtensorMap weight_map,
const __grid_constant__ CUtensorMap activation_map,
Profile* profile = nullptr) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
if (PROFILE && threadIdx.x == 0 && blockIdx.y == 0)
profile[blockIdx.x].start = gclock64();
extern __shared__ __align__(128) char smem[];
__nv_bfloat16* sh_weights = (__nv_bfloat16*)&smem[0];
__nv_bfloat16* sh_activations =
(__nv_bfloat16*)&smem[STAGES * STAGE_UNROLL * TILE_M * TILE_K *
sizeof(__nv_bfloat16)];
#pragma nv_diag_suppress static_var_with_dynamic_init
__shared__ barrier bar_wt_ready[STAGES];
__shared__ barrier bar_act_ready[STAGES];
__shared__ barrier bar_data_consumed[STAGES];
__shared__ float4 reduction_buffer[128];
__shared__ nv_bfloat16 sh_bias[TILE_M];
if (threadIdx.x == 0) {
for (int i = 0; i < STAGES; i++) {
init(&bar_wt_ready[i], 1);
init(&bar_act_ready[i], 1);
init(&bar_data_consumed[i], 32);
}
ptx::fence_proxy_async(ptx::space_shared);
asm volatile("prefetch.tensormap [%0];"
:
: "l"(reinterpret_cast<uint64_t>(&weight_map))
: "memory");
asm volatile("prefetch.tensormap [%0];"
:
: "l"(reinterpret_cast<uint64_t>(&activation_map))
: "memory");
}
__syncthreads();
int warp_id = threadIdx.x / 32;
int lane_id = threadIdx.x % 32;
int phase = 0;
int mib = blockIdx.x * TILE_M;
int ni = blockIdx.y * TILE_N;
float accum[4];
for (int i = 0; i < 4; i++) accum[i] = 0.f;
int const K_LOOPS_DMA =
(K + 4 * TILE_K * STAGE_UNROLL - 1) / (4 * (TILE_K * STAGE_UNROLL));
int const K_LOOPS_COMPUTE = K_LOOPS_DMA;
// Data loading thread
if (warp_id >= 4 && elect_one_sync()) {
int stage = warp_id % 4;
bool weight_warp = warp_id < 8;
if (!weight_warp) {
cudaGridDependencySynchronize();
cudaTriggerProgrammaticLaunchCompletion();
}
for (int ki = 0; ki < K_LOOPS_DMA; ki++) {
int k = (ki * 4 + (warp_id % 4)) * TILE_K * STAGE_UNROLL;
uint64_t desc_ptr_wt = reinterpret_cast<uint64_t>(&weight_map);
uint64_t desc_ptr_act = reinterpret_cast<uint64_t>(&activation_map);
uint32_t bar_ptr_wt = __cvta_generic_to_shared(&bar_wt_ready[stage]);
uint32_t bar_ptr_act = __cvta_generic_to_shared(&bar_act_ready[stage]);
int bytes_wt = TILE_M * TILE_K * sizeof(__nv_bfloat16);
int bytes_act = TILE_N * TILE_K * sizeof(__nv_bfloat16);
bar_wait(__cvta_generic_to_shared(&bar_data_consumed[stage]), phase ^ 1);
if (weight_warp)
asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
:
: "r"(bar_ptr_wt), "r"(STAGE_UNROLL * bytes_wt));
if (!weight_warp)
asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
:
: "r"(bar_ptr_act), "r"(STAGE_UNROLL * bytes_act));
if (PROFILE && blockIdx.y == 0 && ki == 0 && weight_warp)
profile[blockIdx.x].weight_load_start = gclock64();
if (PROFILE && blockIdx.y == 0 && ki == 0 && !weight_warp)
profile[blockIdx.x].act_load_start = gclock64();
for (int i = 0; i < STAGE_UNROLL; i++) {
uint32_t smem_ptr_wt = __cvta_generic_to_shared(
&sh_weights[(stage * STAGE_UNROLL + i) * TILE_M * TILE_K]);
uint32_t crd0 = k + i * TILE_K;
uint32_t crd1 = mib;
if (weight_warp)
asm volatile(
"cp.async.bulk.tensor.2d.shared::cta.global.mbarrier::complete_"
"tx::bytes [%0], [%1, {%3,%4}], "
"[%2];"
:
: "r"(smem_ptr_wt), "l"(desc_ptr_wt), "r"(bar_ptr_wt), "r"(crd0),
"r"(crd1)
: "memory");
uint32_t smem_ptr_act = __cvta_generic_to_shared(
&sh_activations[(stage * STAGE_UNROLL + i) * TILE_N * TILE_K]);
crd0 = k + i * TILE_K;
crd1 = ni;
if (!weight_warp)
asm volatile(
"cp.async.bulk.tensor.2d.shared::cta.global.mbarrier::complete_"
"tx::bytes [%0], [%1, {%3,%4}], "
"[%2];"
:
: "r"(smem_ptr_act), "l"(desc_ptr_act), "r"(bar_ptr_act),
"r"(crd0), "r"(crd1)
: "memory");
}
stage += 4;
if (stage >= STAGES) {
stage = warp_id % 4;
phase ^= 1;
}
}
// Wait for pending loads to be consumed before exiting, to avoid race
for (int i = 0; i < (STAGES / 4) - 1; i++) {
bar_wait(__cvta_generic_to_shared(&bar_data_consumed[stage]), phase ^ 1);
stage += 4;
if (stage >= STAGES) {
stage = warp_id % 4;
phase ^= 1;
}
}
}
// Compute threads
else if (warp_id < 4) {
// Sneak the bias load into the compute warps since they're just waiting for
// stuff anyway
if (threadIdx.x < TILE_M) sh_bias[threadIdx.x] = bias[mib + threadIdx.x];
int stage = warp_id;
int phase = 0;
int lane_id_div8 = lane_id / 8;
int lane_id_mod8 = lane_id % 8;
int lane_row_offset_wt = (lane_id_div8 % 2) ? 8 : 0;
int lane_col_offset_wt = (lane_id_div8 / 2) ? 1 : 0;
int row_wt = lane_id_mod8 + lane_row_offset_wt;
int row_act = lane_id_mod8;
int row_offset_wt = (reinterpret_cast<uintptr_t>(sh_weights) / 128) % 8;
int row_offset_act = row_offset_wt;
uint32_t bar_ptr_wt = __cvta_generic_to_shared(&bar_wt_ready[stage]);
uint32_t bar_ptr_act = __cvta_generic_to_shared(&bar_act_ready[stage]);
bool weight_ready = bar_try_wait(bar_ptr_wt, phase);
bool act_ready = bar_try_wait(bar_ptr_act, phase);
#pragma unroll 2
for (int ki = 0; ki < K_LOOPS_COMPUTE; ki++) {
int next_stage = stage + 4;
int next_phase = phase;
if (next_stage >= STAGES) {
next_stage = warp_id;
next_phase ^= 1;
}
while (!weight_ready || !act_ready) {
weight_ready = bar_try_wait(bar_ptr_wt, phase);
act_ready = bar_try_wait(bar_ptr_act, phase);
}
if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0 && ki == 0)
profile[blockIdx.x].compute_start = gclock64();
if (ki + 1 < K_LOOPS_COMPUTE) {
weight_ready = bar_try_wait(
__cvta_generic_to_shared(&bar_wt_ready[next_stage]), next_phase);
act_ready = bar_try_wait(
__cvta_generic_to_shared(&bar_act_ready[next_stage]), next_phase);
}
#pragma unroll
for (int su = 0; su < STAGE_UNROLL; su++) {
__nv_bfloat16* ptr_weights =
&sh_weights[(stage * STAGE_UNROLL + su) * TILE_M * TILE_K];
__nv_bfloat16* ptr_act =
&sh_activations[(stage * STAGE_UNROLL + su) * TILE_N * TILE_K];
#pragma unroll
for (int kii = 0; kii < TILE_K / 16; kii++) {
__nv_bfloat16 a[8];
__nv_bfloat16 b[4];
int col = 2 * kii + lane_col_offset_wt;
int col_sw = ((row_wt + row_offset_wt) % 8) ^ col;
ldmatrix4(a, __cvta_generic_to_shared(
&ptr_weights[row_wt * TILE_K + col_sw * 8]));
col = 2 * kii + lane_id_div8;
col_sw = ((row_act + row_offset_act) % 8) ^ col;
ldmatrix2(b, __cvta_generic_to_shared(
&ptr_act[row_act * TILE_K + 8 * col_sw]));
HMMA_16816(accum, a, b, accum);
}
}
uint32_t bar_c = __cvta_generic_to_shared(&bar_data_consumed[stage]);
asm volatile("mbarrier.arrive.shared::cta.b64 _, [%0];" : : "r"(bar_c));
stage = next_stage;
phase = next_phase;
}
float4 accum4;
accum4.x = accum[0];
accum4.y = accum[1];
accum4.z = accum[2];
accum4.w = accum[3];
reduction_buffer[threadIdx.x] = accum4;
__syncthreads();
if (warp_id == 0) {
int mi = mib + warp_id * WARP_TILE_M;
int tm = mi + lane_id / 4;
int tn = ni + 2 * (lane_id % 4);
float4 accum1 = reduction_buffer[32 + threadIdx.x];
float4 accum2 = reduction_buffer[64 + threadIdx.x];
float4 accum3 = reduction_buffer[96 + threadIdx.x];
accum[0] = accum[0] + accum1.x + accum2.x + accum3.x;
accum[1] = accum[1] + accum1.y + accum2.y + accum3.y;
accum[2] = accum[2] + accum1.z + accum2.z + accum3.z;
accum[3] = accum[3] + accum1.w + accum2.w + accum3.w;
float bias_lo = __bfloat162float(sh_bias[tm - mib]);
float bias_hi = __bfloat162float(sh_bias[tm + 8 - mib]);
if (tn < N && tm < M)
output[tn * M + tm] = __float2bfloat16(accum[0] + bias_lo);
if (tn + 1 < N && tm < M)
output[(tn + 1) * M + tm] = __float2bfloat16(accum[1] + bias_lo);
if (tn < N && tm + 8 < M)
output[tn * M + tm + 8] = __float2bfloat16(accum[2] + bias_hi);
if (tn + 1 < N && tm + 8 < M)
output[(tn + 1) * M + tm + 8] = __float2bfloat16(accum[3] + bias_hi);
if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0)
profile[blockIdx.x].complete = gclock64();
}
}
#endif // end if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
}

View File

@@ -70,8 +70,4 @@ torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
// Supports num_tokens in [1, 16], num_experts in {256, 384}, hidden_dim = 7168
void dsv3_router_gemm(torch::Tensor& output, const torch::Tensor& mat_a,
const torch::Tensor& mat_b);
// gpt-oss optimized router GEMM kernel for SM90+
void gpt_oss_router_gemm(torch::Tensor& output, torch::Tensor input,
torch::Tensor weight, torch::Tensor bias);
#endif

View File

@@ -132,12 +132,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// DeepSeek V3 optimized router GEMM for SM90+
m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
// conditionally compiled so impl registration is in source file
// gpt-oss optimized router GEMM kernel for SM90+
m.def(
"gpt_oss_router_gemm(Tensor! output, Tensor input, Tensor weights, "
"Tensor bias) -> ()");
m.impl("gpt_oss_router_gemm", torch::kCUDA, &gpt_oss_router_gemm);
#endif
}

View File

@@ -201,6 +201,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
torch::Tensor _zeros, int64_t split_k_iters,
int64_t thx, int64_t thy);
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
#endif
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
@@ -261,8 +262,7 @@ void get_cutlass_moe_mm_data(
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets,
const bool is_gated);
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
const torch::Tensor& expert_first_token_offset,

View File

@@ -1,13 +1,10 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/accelerator.h>
#include <torch/csrc/stable/ops.h>
#include <torch/headeronly/core/ScalarType.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_fp16.h>
#include "torch_utils.h"
static constexpr int default_threads = 256;
static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
@@ -67,22 +64,19 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
// More efficient version of A[..., perm]
// taken from gptq_marlin.cu
torch::stable::Tensor permute_cols(torch::stable::Tensor const& A,
torch::stable::Tensor const& perm) {
const int32_t dev = A.get_device_index();
const torch::stable::accelerator::DeviceGuard device_guard(dev);
const auto stream = get_current_cuda_stream(dev);
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
auto dev = A.get_device();
auto stream = at::cuda::getCurrentCUDAStream(dev);
STD_TORCH_CHECK(
A.scalar_type() == torch::headeronly::ScalarType::Half ||
A.scalar_type() == torch::headeronly::ScalarType::BFloat16,
"Currently only 16bit types are supported");
STD_TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
STD_TORCH_CHECK(A.size(-1) % 8 == 0,
"A columns must be a multiple of 8 (128bits)");
auto A_2d = torch::stable::view(A, {-1, A.size(-1)});
TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16,
"Currently only 16bit types are supported");
TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
TORCH_CHECK(A.size(-1) % 8 == 0,
"A columns must be a multiple of 8 (128bits)");
auto A_2d = A.view({-1, A.size(-1)});
torch::stable::Tensor D = torch::stable::empty_like(A);
torch::Tensor D = torch::empty_like(A);
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
int block_rows = div_ceil(A_2d.size(0), sms);

View File

@@ -286,15 +286,6 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
"Outer scale stride must be 1 when scales are not transposed");
}
int64_t hidden_size = input.size(-1);
TORCH_CHECK(hidden_size > 0 && hidden_size % group_size == 0,
"hidden_size must be a positive multiple of group_size");
int64_t num_tokens = input.numel() / hidden_size;
int64_t num_groups = hidden_size / group_size;
TORCH_CHECK(scales.numel() >= num_tokens * num_groups,
"scales buffer too small: need ", num_tokens * num_groups,
" elements, got ", scales.numel());
rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
var_epsilon, scale_ub, residual,
is_scale_transposed);

View File

@@ -17,11 +17,8 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
int32_t* problem_sizes2,
int32_t* atomic_buffer,
const int topk_length, const int n,
const int k, const bool is_gated) {
const int k) {
int expert_id = blockIdx.x;
// For gated activations (gate + up), first GEMM output is 2*n.
// For non-gated activations (up only), first GEMM output is n.
int const n1 = is_gated ? 2 * n : n;
int occurrences = 0;
for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
@@ -34,13 +31,13 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
int final_occurrences = atomic_buffer[expert_id];
if constexpr (!SWAP_AB) {
problem_sizes1[expert_id * 3] = final_occurrences;
problem_sizes1[expert_id * 3 + 1] = n1;
problem_sizes1[expert_id * 3 + 1] = 2 * n;
problem_sizes1[expert_id * 3 + 2] = k;
problem_sizes2[expert_id * 3] = final_occurrences;
problem_sizes2[expert_id * 3 + 1] = k;
problem_sizes2[expert_id * 3 + 2] = n;
} else {
problem_sizes1[expert_id * 3] = n1;
problem_sizes1[expert_id * 3] = 2 * n;
problem_sizes1[expert_id * 3 + 1] = final_occurrences;
problem_sizes1[expert_id * 3 + 2] = k;
problem_sizes2[expert_id * 3] = k;
@@ -110,11 +107,13 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
}
namespace {
inline void launch_compute_problem_sizes(
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, torch::Tensor& atomic_buffer,
int64_t num_experts, int64_t n, int64_t k, cudaStream_t stream,
const bool swap_ab, const bool is_gated) {
inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
torch::Tensor& atomic_buffer,
int64_t num_experts, int64_t n,
int64_t k, cudaStream_t stream,
const bool swap_ab) {
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
auto const* topk_ptr = topk_ids.data_ptr<int32_t>();
@@ -126,7 +125,7 @@ inline void launch_compute_problem_sizes(
compute_problem_sizes<SwapAB><<<num_experts, num_threads, 0, stream>>>(
topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
static_cast<int>(topk_ids.numel()), static_cast<int>(n),
static_cast<int>(k), is_gated);
static_cast<int>(k));
});
}
} // namespace
@@ -223,8 +222,7 @@ void get_cutlass_moe_mm_data_caller(
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets,
const bool is_gated) {
const std::optional<torch::Tensor>& blockscale_offsets) {
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
auto options_int32 =
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
@@ -238,7 +236,7 @@ void get_cutlass_moe_mm_data_caller(
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
atomic_buffer, num_experts, n, k, stream,
may_swap_ab, is_gated);
may_swap_ab);
if (blockscale_offsets.has_value()) {
// fp4 path

View File

@@ -75,8 +75,7 @@ void get_cutlass_moe_mm_data_caller(
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets,
const bool is_gated);
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
const torch::Tensor& expert_first_token_offset,
@@ -279,8 +278,7 @@ void get_cutlass_moe_mm_data(
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets,
const bool is_gated) {
const std::optional<torch::Tensor>& blockscale_offsets) {
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
@@ -290,7 +288,7 @@ void get_cutlass_moe_mm_data(
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation,
output_permutation, num_experts, n, k,
blockscale_offsets, is_gated);
blockscale_offsets);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(

View File

@@ -26,16 +26,6 @@
#define __HIP__GFX9__
#endif
#if defined(__HIPCC__) && \
(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1150__) || \
defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__))
#define __HIP__GFX1X__
#endif
#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__))
#define __HIP__GFX12__
#endif
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
#define __HIP__MI3XX__
#endif
@@ -47,31 +37,15 @@
#endif
int get_lds_size() {
static const int result = [] {
const auto* dprops = at::cuda::getCurrentDeviceProperties();
const std::string device_arch = dprops->gcnArchName;
return device_arch.find("gfx95") == std::string::npos ? 64 * 1024
: 160 * 1024;
}();
return result;
}
bool on_gfx1x() {
static const bool result = [] {
const auto* dprops = at::cuda::getCurrentDeviceProperties();
const std::string device_arch = dprops->gcnArchName;
return device_arch.find("gfx11") != std::string::npos ||
device_arch.find("gfx12") != std::string::npos;
}();
return result;
}
bool on_gfx12() {
static const bool result = [] {
const auto* dprops = at::cuda::getCurrentDeviceProperties();
const std::string device_arch = dprops->gcnArchName;
return device_arch.find("gfx12") != std::string::npos;
}();
static bool is_cached = false;
static int result;
if (is_cached == false) {
auto dprops = at::cuda::getCurrentDeviceProperties();
std::string device_arch = dprops->gcnArchName;
size_t substring = device_arch.find("gfx95");
result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
is_cached = true;
}
return result;
}
@@ -312,35 +286,21 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
return out_c;
}
#if defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)
#define DOT2C(V0, V2, V3) \
if constexpr (std::is_same_v<scalar_t, half>) { \
asm("v_dot2c_f32_f16 %0, %2, %3" \
: "=v"(V0) \
: "0"(V0), "v"(V2), "v"(V3)); \
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
V0 += (s.x + s.y); \
}
#elif defined(__HIP__GFX1X__)
// gfx1x: v_dot2_f32_f16 (VOP3-P, dot10-insts, available on gfx11+gfx12)
#define DOT2C(V0, V2, V3) \
if constexpr (std::is_same_v<scalar_t, half>) { \
asm("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(V0) : "v"(V2), "v"(V3)); \
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
V0 += (s.x + s.y); \
}
#endif
#define DOT2C(V0, V2, V3) \
if constexpr (std::is_same_v<scalar_t, half>) { \
asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
V0 += (s.x + s.y); \
}
// To avoid LLVM silently upcasting to double
__device__ inline unsigned int min__(uint32_t a, uint32_t b) {
return min(a, b);
}
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
// This version targets cases where A[] fits LDS capacity
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
@@ -482,18 +442,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
1); // row_shr2
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
1); // row_shr1
#if defined(__HIP__GFX9__)
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
1); // ROW_BCAST15
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
1); // ROW_BCAST31
#else
sum[n][y] += __shfl_xor(sum[n][y], 16);
#endif
}
}
if (threadIdx.x == (THRDS - 1)) {
if (threadIdx.x == 63) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -513,10 +469,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
} else {
#ifdef __HIP__GFX9__
#pragma unroll
#pragma unroll
for (int n = 0; n < N; n++) {
#pragma unroll
#pragma unroll
for (int y = 0; y < YTILE; y++) {
/*float accm1 = 0;
for (int i=0; i<64; i++)
@@ -543,7 +498,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
sum4[n][y][0] = accm;
}
}
if (threadIdx.x == (THRDS - 1)) {
if (threadIdx.x == 63) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -558,12 +513,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
}
#endif // __HIP__GFX9__ (MFMA path)
}
m += CuCount * _WvPrGrp * YTILE;
}
}
#else
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
@@ -574,9 +528,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
#endif
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
// This version targets cases where A[] marginally exceeds LDS capacity
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
@@ -703,18 +657,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
1); // row_shr2
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
1); // row_shr1
#if defined(__HIP__GFX9__)
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
1); // ROW_BCAST15
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
1); // ROW_BCAST31
#else
sum[n][y] += __shfl_xor(sum[n][y], 16);
#endif
}
}
if (threadIdx.x == (THRDS - 1)) {
if (threadIdx.x == 63) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -736,10 +686,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
} else {
#ifdef __HIP__GFX9__
#pragma unroll
#pragma unroll
for (int n = 0; n < N; n++) {
#pragma unroll
#pragma unroll
for (int y = 0; y < YTILE; y++) {
// float accm1 = 0;
// for (int i=0; i<64; i++)
@@ -764,7 +713,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
sum4[n][y][0] = accm;
}
}
if (threadIdx.x == (THRDS - 1)) {
if (threadIdx.x == 63) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -781,7 +730,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
}
#endif // __HIP__GFX9__ (MFMA path)
}
m += CuCount * _WvPrGrp * YTILE;
@@ -798,7 +746,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
#else
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
@@ -808,9 +756,9 @@ __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
#endif
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
// This version targets big A[] cases, where it is much larger than LDS capacity
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
@@ -1056,18 +1004,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
1); // row_shr2
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
1); // row_shr1
#if defined(__HIP__GFX9__)
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
1); // ROW_BCAST15
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
1); // ROW_BCAST31
#else
sum[n][y] += __shfl_xor(sum[n][y], 16);
#endif
}
}
if (threadIdx.x == (THRDS - 1)) {
if (threadIdx.x == 63) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -1089,10 +1033,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
} else {
#ifdef __HIP__GFX9__
#pragma unroll
#pragma unroll
for (int n = 0; n < N; n++) {
#pragma unroll
#pragma unroll
for (int y = 0; y < YTILE; y++) {
float accm = sum4[n][y][0];
accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
@@ -1114,7 +1057,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
sum4[n][y][0] = accm;
}
}
if (threadIdx.x == (THRDS - 1)) {
if (threadIdx.x == 63) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -1131,7 +1074,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
}
#endif // __HIP__GFX9__ (MFMA path)
}
m += CuCount * _WvPrGrp * YTILE;
@@ -1148,7 +1090,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
}
#else
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
@@ -1159,7 +1101,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
#endif
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
// Find the min val of div2 that doesn't increase N/(div1*div2)
int mindiv(int N, int div1, int div2) {
@@ -1206,40 +1148,40 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size() / 2;
#define WVSPLITK_CFG(_THRDS, _WVPRGRP, _YTILE, _UNRL, _N) \
#define WVSPLITK(_YTILE, _UNRL, _N) \
{ \
dim3 block(_THRDS, _WVPRGRP); \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, _WVPRGRP); \
dim3 block(64, 16); \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
if ((Kbp_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
wvSplitK_hf_sml_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
CuCount); \
else if (Kbp_in * N_in <= max_lds_len * 1.2) \
wvSplitK_hf_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
CuCount); \
else \
wvSplitK_hf_big_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
CuCount); \
}
#define WVSPLIT_TILE_CFG(_THRDS, _WVPRGRP, _sYT, __N) \
#define WVSPLIT_TILE(_sYT, __N) \
{ \
bool fit_lds = (Kbp_in * N_in <= max_lds_len); \
if (_sYT <= 1) \
WVSPLITK_CFG(_THRDS, _WVPRGRP, 1, 4, __N) \
WVSPLITK(1, 4, __N) \
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
WVSPLITK_CFG(_THRDS, _WVPRGRP, 2, 2, __N) \
WVSPLITK(2, 2, __N) \
else if (_sYT <= 4 * 3) \
WVSPLITK_CFG(_THRDS, _WVPRGRP, 3, 2, __N) \
WVSPLITK(3, 2, __N) \
else if (__N == 4) \
WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 1, __N) \
WVSPLITK(4, 1, __N) \
else \
WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 2, __N) \
WVSPLITK(4, 2, __N) \
}
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@@ -1256,31 +1198,18 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
// then cut the active waves to balance their distribution...
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
const bool use_wave32 = on_gfx1x();
switch (N_in) {
case 1:
if (use_wave32)
WVSPLIT_TILE_CFG(32, 16, sYT, 1)
else
WVSPLIT_TILE_CFG(64, 16, sYT, 1)
WVSPLIT_TILE(sYT, 1)
break;
case 2:
if (use_wave32)
WVSPLIT_TILE_CFG(32, 16, sYT, 2)
else
WVSPLIT_TILE_CFG(64, 16, sYT, 2)
WVSPLIT_TILE(sYT, 2)
break;
case 3:
if (use_wave32)
WVSPLIT_TILE_CFG(32, 16, sYT, 3)
else
WVSPLIT_TILE_CFG(64, 16, sYT, 3)
WVSPLIT_TILE(sYT, 3)
break;
case 4:
if (use_wave32)
WVSPLIT_TILE_CFG(32, 16, sYT, 4)
else
WVSPLIT_TILE_CFG(64, 16, sYT, 4)
WVSPLIT_TILE(sYT, 4)
break;
default:
throw std::runtime_error(
@@ -1724,7 +1653,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#endif
}
}
#else
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
__global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
@@ -1759,8 +1688,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
in_a.dtype() == torch::kBFloat16);
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
auto out_c = torch::empty(
{N_in, M_in},
torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
@@ -1769,6 +1696,7 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
dim3 grid(CuCount);
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// const int max_lds_len = get_lds_size() / 2;
@@ -1845,7 +1773,7 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
return out_c;
}
#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1889,17 +1817,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
float sA = *s_A;
float sB = *s_B;
while (m < M) {
#ifdef __HIP__GFX12__
// gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
float sum[N][YTILE] = {};
#else
// gfx9: MFMA accumulation
scalar8 sum[N][YTILE] = {};
#endif
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
bigType bigA[N][UNRL] = {};
bigType bigB[YTILE][UNRL];
@@ -1931,17 +1854,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
for (uint32_t n = 0; n < N; n++) {
#ifdef __HIP__GFX12__
// gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
for (int y = 0; y < YTILE; ++y) {
#pragma unroll
for (int i = 0; i < A_CHUNK / 4; i++) {
sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
}
}
#else
// gfx9: MFMA path
for (int i = 0; i < A_CHUNK; i += 8) {
for (int y = 0; y < YTILE; ++y) {
sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
@@ -1949,33 +1861,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
0);
}
}
#endif
}
}
}
// Final reduction
#ifdef __HIP__GFX12__
// gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
sum[n][y] += __shfl_xor(sum[n][y], 16);
}
}
#else
// gfx9 MFMA reduction
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
float accm0 = sum[n][y][0];
@@ -1990,15 +1880,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
sum[n][y][0] = accm0;
}
}
#endif
const bool writeback_lane =
#ifdef __HIP__GFX12__
threadIdx.x == (THRDS - 1);
#else
threadIdx.x == 0;
#endif
if (writeback_lane) {
if (threadIdx.x == 0) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -2009,17 +1892,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
#ifdef __HIP__GFX12__
float result = sum[n][y] * sA * sB;
#else
float result = sum[n][y][0] * sA * sB;
#endif
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
result += __half2float(biases[n][y]);
sum[n][y][0] += __half2float(biases[n][y]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
result += __bfloat162float(biases[n][y]);
sum[n][y][0] += __bfloat162float(biases[n][y]);
}
C[m + y + n * M] = __float2s<scalar_t>(result);
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
}
}
}
@@ -2027,7 +1906,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE;
}
}
#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
@@ -2039,9 +1918,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -2084,17 +1963,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
float sA = *s_A;
float sB = *s_B;
while (m < M) {
#ifdef __HIP__GFX12__
// gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
float sum[N][YTILE] = {};
#else
// gfx9: MFMA accumulation
scalar8 sum[N][YTILE] = {};
#endif
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
bigType bigA[N][UNRL] = {};
bigType bigB[YTILE][UNRL];
@@ -2128,17 +2002,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
for (uint32_t n = 0; n < N; n++) {
#ifdef __HIP__GFX12__
// gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
for (int y = 0; y < YTILE; ++y) {
#pragma unroll
for (int i = 0; i < A_CHUNK / 4; i++) {
sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
}
}
#else
// gfx9: MFMA path
for (int i = 0; i < A_CHUNK; i += 8) {
for (int y = 0; y < YTILE; ++y) {
sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
@@ -2146,33 +2009,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
0);
}
}
#endif
}
}
}
// Final reduction
#ifdef __HIP__GFX12__
// gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
: "=v"(sum[n][y])
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
sum[n][y] += __shfl_xor(sum[n][y], 16);
}
}
#else
// gfx9 MFMA reduction
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
float accm0 = sum[n][y][0];
@@ -2187,15 +2028,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
sum[n][y][0] = accm0;
}
}
#endif
const bool writeback_lane =
#ifdef __HIP__GFX12__
threadIdx.x == (THRDS - 1);
#else
threadIdx.x == 0;
#endif
if (writeback_lane) {
if (threadIdx.x == 0) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
@@ -2206,17 +2040,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
#ifdef __HIP__GFX12__
float result = sum[n][y] * sA * sB;
#else
float result = sum[n][y][0] * sA * sB;
#endif
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
result += __half2float(biases[n][y]);
sum[n][y][0] += __half2float(biases[n][y]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
result += __bfloat162float(biases[n][y]);
sum[n][y][0] += __bfloat162float(biases[n][y]);
}
C[m + y + n * M] = __float2s<scalar_t>(result);
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
}
}
}
@@ -2224,7 +2054,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE;
}
}
#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
@@ -2236,7 +2066,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
const int CuCount) {
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
@@ -2269,30 +2099,24 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size();
#define WVSPLITKQ_IMPL(_THRDS, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
{ \
dim3 block(_THRDS, _WvPrGrp); \
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
wvSplitKQ_hf_sml_<fptype, fp8_t, _THRDS, _YTILEs, _WvPrGrp, 16, _UNRLs, \
_N><<<grid, block, 0, stream>>>( \
K_in, Kap_in, Kbp_in, M_in, Bx_in, By_in, b_ptr, a_ptr, bias_ptr, \
c_ptr, s_a, s_b, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
wvSplitKQ_hf_<fptype, fp8_t, _THRDS, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} \
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
{ \
dim3 block(64, _WvPrGrp); \
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} \
}
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
if (on_gfx12()) \
WVSPLITKQ_IMPL(32, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
else \
WVSPLITKQ_IMPL(64, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)
AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] {
using fptype = typename scalar<scalar_t>::type;
auto c_ptr = reinterpret_cast<fptype*>(out_c.data_ptr());
@@ -2312,10 +2136,10 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
WVSPLITKQ(16, 2, 2, 2, 2, 2)
break;
case 3:
WVSPLITKQ(16, 2, 2, 1, 1, 3)
WVSPLITKQ(16, 2, 2, 2, 2, 3)
break;
case 4:
WVSPLITKQ(16, 2, 2, 1, 1, 4)
WVSPLITKQ(16, 2, 2, 2, 2, 4)
break;
default:
throw std::runtime_error(

View File

@@ -303,6 +303,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
") -> Tensor");
// conditionally compiled so impl registration is in source file
ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
ops.impl("permute_cols", torch::kCUDA, &permute_cols);
// Marlin Optimized Quantized GEMM (supports GPTQ, AWQ, FP8, NVFP4, MXFP4).
ops.def(
"marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
@@ -486,8 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! problem_sizes1, Tensor! problem_sizes2, "
" Tensor! input_permutation, "
" Tensor! output_permutation, int num_experts, "
" int n, int k, Tensor? blockscale_offsets, "
" bool is_gated) -> ()");
" int n, int k, Tensor? blockscale_offsets) -> "
"()");
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
// compute per-expert problem sizes from expert_first_token_offset

View File

@@ -44,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev liblzma-dev pkg-config \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
&& for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \

View File

@@ -76,22 +76,19 @@ ENV UV_LINK_MODE="copy"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/common.txt,target=/workspace/vllm/requirements/common.txt \
--mount=type=bind,src=requirements/xpu.txt,target=/workspace/vllm/requirements/xpu.txt \
--mount=type=bind,src=requirements/xpu-test.in,target=/workspace/vllm/requirements/xpu-test.in \
uv pip install --upgrade pip && \
uv pip install -r requirements/xpu.txt && \
uv pip compile /workspace/vllm/requirements/xpu-test.in \
-o /workspace/vllm/requirements/xpu-test.txt \
-c /workspace/vllm/requirements/xpu.txt \
--index-strategy unsafe-best-match \
--extra-index-url ${PIP_EXTRA_INDEX_URL} \
--python-version ${PYTHON_VERSION} && \
uv pip install grpcio-tools protobuf nanobind && \
uv pip install -r requirements/xpu.txt
# used for suffix method speculative decoding
# build deps for proto + nanobind-based extensions to set up the build environment
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install grpcio-tools protobuf nanobind
# arctic-inference is built from source which needs torch-xpu properly installed first
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/intel/oneapi/setvars.sh --force && \
source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force && \
export CMAKE_PREFIX_PATH="$(python3 -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
uv pip install --no-build-isolation -r /workspace/vllm/requirements/xpu-test.txt
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
uv pip install --no-build-isolation arctic-inference==0.1.1
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

View File

@@ -25,7 +25,7 @@ nav:
- Models:
- models/supported_models.md
- models/generative_models.md
- Pooling Models: models/pooling_models
- models/pooling_models.md
- models/extensions
- Hardware Supported Models:
- models/hardware_supported_models/*

View File

@@ -37,7 +37,7 @@ For [generative models](../../models/generative_models.md), there are two levels
#### Pooling models
For [pooling models](../../models/pooling_models/README.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
### Multi-modal processing

View File

@@ -3,10 +3,6 @@
!!! warning
Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
!!! tip "Choosing a profiler"
- Use **Nsight Systems** for low-overhead, performance-critical profiling.
- Use **PyTorch Profiler** for medium-overhead profiling with richer debugging information (e.g., stack traces, memory, shapes). Note that enabling these features adds overhead and is not recommended for benchmarking.
## Profile with PyTorch Profiler
We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server.

View File

@@ -127,8 +127,8 @@ Priority is **1 = highest** (tried first).
| 3 | `FLASH_ATTN_MLA` |
| 4 | `FLASHMLA` |
| 5 | `TRITON_MLA` |
| 6 | `FLASHINFER_MLA_SPARSE`**\*** |
| 7 | `FLASHMLA_SPARSE` |
| 6 | `FLASHMLA_SPARSE` |
| 7 | `FLASHINFER_MLA_SPARSE` |
**Ampere/Hopper (SM 8.x-9.x):**
@@ -140,8 +140,6 @@ Priority is **1 = highest** (tried first).
| 4 | `TRITON_MLA` |
| 5 | `FLASHMLA_SPARSE` |
> **\*** For sparse MLA, FP8 KV cache always prefers `FLASHINFER_MLA_SPARSE`. With BF16 KV cache, `FLASHINFER_MLA_SPARSE` is preferred for low query-head counts (<= 16), while `FLASHMLA_SPARSE` is preferred otherwise.
>
> **Note:** ROCm and CPU platforms have their own selection logic. See the platform-specific documentation for details.
## Legend

View File

@@ -51,8 +51,11 @@ For example:
**1. Attention:**
```python
--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
--8<-- "vllm/model_executor/models/deepencoder.py:rel_pos_attention"
```
**2. Activation:**
@@ -167,16 +170,6 @@ For example:
--8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
```
**12. Encoder:**
```python
--8<-- "vllm/model_executor/models/deepencoder2.py:qwen2_decoder"
--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
--8<-- "vllm/model_executor/models/deepencoder.py:rel_pos_attention"
```
## Guidelines for Implementing a New CustomOp
### Implement a New CustomOp in vLLM

View File

@@ -88,8 +88,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
| rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
@@ -103,7 +103,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
## Modular Kernel "families"
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts.
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
| ------- | ---------------------------------------------- | ----------------------------------- |

View File

@@ -29,9 +29,10 @@ To compile a multimodal component such as an encoder, we follow the same mechani
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_encoder`. This will gate the compilation behind our
`compile_mm_encoder` configuration
2. The `@support_torch_compile` decorator should include `is_encoder=True` for encoder components. This is needed for compile range integration
(see Compile Range Integration). The decorator automatically uses the class name as the cache directory prefix, avoiding collisions between
independently compiled sub-modules (e.g. vision encoder components vs the text backbone).
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
relies on caching artifacts to reduce start time, we must properly propagate the `<component_name>` information to the cache in order to avoid collisions
with the LLM text-backbone, or other instances of the same artifact (as is the case with vision block). `is_encoder=True` is also needed for encoder
components (see Compile Range Integration).
### CompilationConfig
@@ -56,8 +57,8 @@ tradeoff
### Compile ranges
The torch.compile integration will try to rely on max_batch_size to infer compilation ranges for dynamic shapes; however, for modules used in the encoder, this
shape can be difficult to infer due to the unspecified range of shapes the encoder may see as input. Therefore, we rely on `is_encoder=True` in the
`@support_torch_compile` decorator to alert torch.compile to the fact that this range cannot be inferred, and we default to the range (1, MAX_INT).
shape can be difficult to infer due to the unspecified range of shapes the encoder may see as input. Therefore, we rely on `is_encoder=True` in the `set_model_tag`
to alert torch.compile to the fact that this range cannot be inferred, and we default to the range (1, MAX_INT).
!!! note
We may seek to tighten this range for better performance in the future

View File

@@ -36,14 +36,14 @@ th:not(:first-child) {
}
</style>
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models/README.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
| - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| [SD](speculative_decoding/README.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
| [pooling](../models/pooling_models/README.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](https://github.com/vllm-project/vllm/issues/7366) | ❌ | [](https://github.com/vllm-project/vllm/issues/7366) | ✅ | ✅ | ✅ | | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
@@ -66,7 +66,7 @@ th:not(:first-child) {
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](speculative_decoding/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |

View File

@@ -389,17 +389,3 @@ vllm serve model --enable-lora --max-lora-rank 64
# Bad: unnecessarily high, wastes memory
vllm serve model --enable-lora --max-lora-rank 256
```
### Restricting LoRA to Specific Modules
The `--lora-target-modules` parameter allows you to restrict which model modules have LoRA applied at deployment time. This is useful for performance tuning when you only need LoRA on specific layers:
```bash
# Apply LoRA only to output projection layers
vllm serve model --enable-lora --lora-target-modules o_proj
# Apply LoRA to multiple specific modules
vllm serve model --enable-lora --lora-target-modules o_proj qkv_proj down_proj
```
When `--lora-target-modules` is not specified, LoRA will be applied to all supported modules in the model. This parameter accepts module suffixes (the last component of the module name), such as `o_proj`, `qkv_proj`, `gate_proj`, etc.

View File

@@ -5,7 +5,7 @@ vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.
Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
!!! warning
`reasoning` used to be called `reasoning_content`. To migrate, directly replace `reasoning_content` with `reasoning`.
`reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning` in case `reasoning_content` is removed in future.
## Supported Models

View File

@@ -107,27 +107,6 @@ vLLM supports the `tool_choice='none'` option in the chat completion API. When t
!!! note
When tools are specified in the request, vLLM includes tool definitions in the prompt by default, regardless of the `tool_choice` setting. To exclude tool definitions when `tool_choice='none'`, use the `--exclude-tools-when-tool-choice-none` option.
## Constrained Decoding Behavior
Whether vLLM enforces the tool parameter schema during generation depends on the `tool_choice` mode:
| `tool_choice` value | Schema-constrained decoding | Behavior |
| --- | --- | --- |
| Named function | Yes (via structured outputs backend) | Arguments are guaranteed to be valid JSON conforming to the function's parameter schema. |
| `"required"` | Yes (via structured outputs backend) | Same as named function. The model must produce at least one tool call. |
| `"auto"` | No | The model generates freely. A tool-call parser extracts tool calls from the raw text. Arguments may be malformed or not match the schema. |
| `"none"` | N/A | No tool calls are produced. |
When schema conformance matters, prefer `tool_choice="required"` or named function calling over `"auto"`.
### Strict Mode (`strict` parameter)
The [OpenAI API](https://platform.openai.com/docs/guides/function-calling#strict-mode) supports a `strict` field on function definitions. When set to `true`, OpenAI uses constrained decoding to guarantee that tool-call arguments match the function schema, even in `tool_choice="auto"` mode.
vLLM **does not implement** `strict` mode today. The `strict` field is accepted in requests (to avoid breaking clients that set it), but it has no effect on decoding behavior. In auto mode, argument validity depends entirely on the model's output quality and the parser's extraction logic.
Tracking issues: [#15526](https://github.com/vllm-project/vllm/issues/15526), [#16313](https://github.com/vllm-project/vllm/issues/16313).
## Automatic Function Calling
To enable this feature, you should set the following flags:
@@ -145,9 +124,6 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso
If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!
!!! note
With `tool_choice="auto"`, tool-call arguments are extracted from the model's raw text output by the selected parser. No schema-level constraint is applied during decoding, so arguments may occasionally be malformed or violate the function's parameter schema. See [Constrained Decoding Behavior](#constrained-decoding-behavior) for details.
### Hermes Models (`hermes`)
All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported.

View File

@@ -23,18 +23,15 @@ def title(text: str) -> str:
# Custom substitutions
subs = {
"io": "IO",
"rl": "RL",
"api(s?)": r"API\1",
"api": "API",
"cli": "CLI",
"cpu": "CPU",
"ipc": "IPC",
"llm": "LLM",
"mae": "MAE",
"ner": "NER",
"tpu": "TPU",
"gguf": "GGUF",
"lora": "LoRA",
"nccl": "NCCL",
"rlhf": "RLHF",
"vllm": "vLLM",
"openai": "OpenAI",
@@ -199,11 +196,6 @@ class Example:
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
# Monkey-patch dirname_to_title in awesome-nav so that sub-directory names are
# title-cased (e.g. "Offline Inference" instead of "Offline inference").
import mkdocs_awesome_nav.nav.directory as _nav_dir
_nav_dir.dirname_to_title = title
logger.info("Generating example documentation")
logger.debug("Root directory: %s", ROOT_DIR.resolve())
logger.debug("Example directory: %s", EXAMPLE_DIR.resolve())

View File

@@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MkDocs hook + markdown extension to enable the following links to render correctly,
including inside content included via pymdownx.snippets:
MkDocs hook to enable the following links to render correctly:
- Relative file links outside of the `docs/` directory, e.g.:
- [Text](../some_file.py)
@@ -13,17 +12,13 @@ including inside content included via pymdownx.snippets:
e.g. <...pull/123> -> [Pull Request #123](.../pull/123)
- Works for external repos too by including the `owner/repo` in the link title
The link replacement runs as a markdown preprocessor (priority 25) so that it executes
after pymdownx.snippets (priority 32) has expanded all included content.
The on_page_markdown hook passes the current page context to the preprocessor before
each page is converted.
The goal is to simplify cross-referencing common GitHub resources
in project docs.
"""
from pathlib import Path
import regex as re
from markdown import Extension
from markdown.preprocessors import Preprocessor
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.files import Files
from mkdocs.structure.pages import Page
@@ -31,6 +26,7 @@ from mkdocs.structure.pages import Page
ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
DOC_DIR = ROOT_DIR / "docs"
gh_icon = ":octicons-mark-github-16:"
# Regex pieces
@@ -52,90 +48,46 @@ github_link = re.compile(rf"(\[{TITLE}\]\(|<){URL}(\)|>)")
relative_link = re.compile(rf"\[{TITLE}\]\({RELATIVE}\)")
class UrlSchemesPreprocessor(Preprocessor):
"""Preprocessor that runs after pymdownx.snippets to process all links."""
def __init__(self, md, ext):
super().__init__(md)
self.ext = ext
def run(self, lines):
page = self.ext.page
if page is None or getattr(page.file, "abs_src_path", None) is None:
return lines
def replace_relative_link(match: re.Match) -> str:
"""
Replace relative file links with URLs if they point outside the docs dir.
"""
title = match.group("title")
path = match.group("path")
path = (Path(page.file.abs_src_path).parent / path).resolve()
fragment = match.group("fragment") or ""
# Check if the path exists and is outside the docs dir
if not path.exists() or path.is_relative_to(DOC_DIR):
return match.group(0)
# Files and directories have different URL schemes on GitHub
slug = "tree/main" if path.is_dir() else "blob/main"
path = path.relative_to(ROOT_DIR)
url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}"
return f"[{gh_icon} {title}]({url})"
def replace_github_link(match: re.Match) -> str:
"""
Replace GitHub issue, PR, and project links with enhanced Markdown links.
"""
repo = match.group("repo")
type = match.group("type")
number = match.group("number")
# Title and fragment could be None
title = match.group("title") or ""
fragment = match.group("fragment") or ""
# Use default titles for raw links
if not title:
title = TITLES[type]
if "vllm-project" not in repo:
title += repo
title += f"#{number}"
url = f"https://github.com/{repo}/{type}/{number}{fragment}"
return f"[{gh_icon} {title}]({url})"
markdown = "\n".join(lines)
markdown = relative_link.sub(replace_relative_link, markdown)
markdown = github_link.sub(replace_github_link, markdown)
return markdown.split("\n")
class UrlSchemesExtension(Extension):
"""Markdown extension that registers the URL schemes preprocessor."""
def __init__(self, **kwargs):
self.page = None
super().__init__(**kwargs)
def extendMarkdown(self, md):
# Priority 25 runs after pymdownx.snippets (priority 32)
md.preprocessors.register(UrlSchemesPreprocessor(md, self), "url_schemes", 25)
# Singleton extension instance shared between the hook and the preprocessor.
_ext = UrlSchemesExtension()
def on_config(config: MkDocsConfig) -> MkDocsConfig:
"""Register the URL schemes markdown extension."""
config["markdown_extensions"].append(_ext)
return config
def on_page_markdown(
markdown: str, *, page: Page, config: MkDocsConfig, files: Files
) -> str:
"""Pass the current page context to the preprocessor."""
_ext.page = page
def replace_relative_link(match: re.Match) -> str:
"""Replace relative file links with URLs if they point outside the docs dir."""
title = match.group("title")
path = match.group("path")
path = (Path(page.file.abs_src_path).parent / path).resolve()
fragment = match.group("fragment") or ""
# Check if the path exists and is outside the docs dir
if not path.exists() or path.is_relative_to(DOC_DIR):
return match.group(0)
# Files and directories have different URL schemes on GitHub
slug = "tree/main" if path.is_dir() else "blob/main"
path = path.relative_to(ROOT_DIR)
url = f"https://github.com/vllm-project/vllm/{slug}/{path}{fragment}"
return f"[{gh_icon} {title}]({url})"
def replace_github_link(match: re.Match) -> str:
"""Replace GitHub issue, PR, and project links with enhanced Markdown links."""
repo = match.group("repo")
type = match.group("type")
number = match.group("number")
# Title and fragment could be None
title = match.group("title") or ""
fragment = match.group("fragment") or ""
# Use default titles for raw links
if not title:
title = TITLES[type]
if "vllm-project" not in repo:
title += repo
title += f"#{number}"
url = f"https://github.com/{repo}/{type}/{number}{fragment}"
return f"[{gh_icon} {title}]({url})"
markdown = relative_link.sub(replace_relative_link, markdown)
markdown = github_link.sub(replace_github_link, markdown)
return markdown

View File

@@ -0,0 +1,676 @@
# Pooling Models
vLLM also supports pooling models, such as embedding, classification, and reward models.
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
before returning them.
!!! note
We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
## Configuration
### Model Runner
Run a model in pooling mode via the option `--runner pooling`.
!!! tip
There is no need to set this option in the vast majority of cases as vLLM can automatically
detect the appropriate model runner via `--runner auto`.
### Model Conversion
vLLM can adapt models for various pooling tasks via the option `--convert <type>`.
If `--runner pooling` has been set (manually or automatically) but the model does not implement the
[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface,
vLLM will attempt to automatically convert the model according to the architecture names
shown in the table below.
| Architecture | `--convert` | Supported pooling tasks |
| ----------------------------------------------- | ----------- | ------------------------------------- |
| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` |
| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` |
| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` |
!!! tip
You can explicitly set `--convert <type>` to specify how to convert the model.
### Pooling Tasks
Each pooling model in vLLM supports one or more of these tasks according to
[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks],
enabling the corresponding APIs:
| Task | APIs |
| ---------------- | ----------------------------------------------------------------------------- |
| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` |
| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` |
| `score` | `LLM.score(...)` |
| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` |
| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` |
| `plugin` | `LLM.encode(..., pooling_task="plugin")` |
\* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task.
### Pooler Configuration
#### Predefined models
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
you can override some of its attributes via the `--pooler-config` option.
#### Converted models
If the model has been converted via `--convert` (see above),
the pooler assigned to each task has the following attributes by default:
| Task | Pooling Type | Normalization | Softmax |
| ---------- | ------------ | ------------- | ------- |
| `embed` | `LAST` | ✅︎ | ❌ |
| `classify` | `LAST` | ❌ | ✅︎ |
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers' defaults.
## Offline Inference
The [LLM][vllm.LLM] class provides various methods for offline inference.
See [configuration](../api/README.md#configuration) for a list of options when initializing the model.
### `LLM.embed`
The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
It is primarily designed for embedding models.
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.embed("Hello, my name is")
embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```
A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
### `LLM.classify`
The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
It is primarily designed for classification models.
```python
from vllm import LLM
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
(output,) = llm.classify("Hello, my name is")
probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```
A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
### `LLM.score`
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
It is designed for embedding models and cross-encoder models. Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems.
!!! note
vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
```python
from vllm import LLM
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
(output,) = llm.score(
"What is the capital of France?",
"The capital of Brazil is Brasilia.",
)
score = output.outputs.score
print(f"Score: {score}")
```
A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
### `LLM.reward`
The [reward][vllm.LLM.reward] method is available to all reward models in vLLM.
```python
from vllm import LLM
llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
(output,) = llm.reward("Hello, my name is")
data = output.outputs.data
print(f"Data: {data!r}")
```
A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
!!! note
Please use one of the more specific methods or set the task directly when using `LLM.encode`:
- For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
- For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
- For similarity scores, use `LLM.score(...)`.
- For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
- For token classification, use `pooling_task="token_classify"`.
- For multi-vector retrieval, use `pooling_task="token_embed"`.
- For IO Processor Plugins, use `pooling_task="plugin"`.
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="embed")
data = output.outputs.data
print(f"Data: {data!r}")
```
## Online Serving
Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
- [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
- [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
!!! note
Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
- For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
- For rewards, use `"task":"token_classify"`.
- For token classification, use `"task":"token_classify"`.
- For multi-vector retrieval, use `"task":"token_embed"`.
- For IO Processor Plugins, use `"task":"plugin"`.
```python
# start a supported embeddings model server with `vllm serve`, e.g.
# vllm serve intfloat/e5-small
import requests
host = "localhost"
port = "8000"
model_name = "intfloat/e5-small"
api_url = f"http://{host}:{port}/pooling"
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompt = {"model": model_name, "input": prompts, "task": "embed"}
response = requests.post(api_url, json=prompt)
for output in response.json()["data"]:
data = output["data"]
print(f"Data: {data!r} (size={len(data)})")
```
## Matryoshka Embeddings
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
!!! warning
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
```json
{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
```
### Manually enable Matryoshka Embeddings
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled.
```bash
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
```
### Offline Inference
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams].
```python
from vllm import LLM, PoolingParams
llm = LLM(
model="jinaai/jina-embeddings-v3",
runner="pooling",
trust_remote_code=True,
)
outputs = llm.embed(
["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32),
)
print(outputs[0].outputs)
```
A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy_offline.py](../../examples/pooling/embed/embed_matryoshka_fy_offline.py)
### Online Inference
Use the following command to start the vLLM server.
```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
```
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
```bash
curl http://127.0.0.1:8000/v1/embeddings \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"input": "Follow the white rabbit.",
"model": "jinaai/jina-embeddings-v3",
"encoding_format": "float",
"dimensions": 32
}'
```
Expected output:
```json
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
```
An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy_client.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy_client.py)
## Specific models
### ColBERT Late Interaction Models
[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders.
vLLM supports ColBERT models with multiple encoder backbones:
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
**BERT-based ColBERT** models work out of the box:
```shell
vllm serve answerdotai/answerai-colbert-small-v1
```
For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture:
```shell
# ModernBERT backbone
vllm serve lightonai/GTE-ModernColBERT-v1 \
--hf-overrides '{"architectures": ["ColBERTModernBertModel"]}'
# Jina XLM-RoBERTa backbone
vllm serve jinaai/jina-colbert-v2 \
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
--trust-remote-code
```
Then you can use the rerank endpoint:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "answerdotai/answerai-colbert-small-v1",
"query": "What is machine learning?",
"documents": [
"Machine learning is a subset of artificial intelligence.",
"Python is a programming language.",
"Deep learning uses neural networks."
]
}'
```
Or the score endpoint:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "answerdotai/answerai-colbert-small-v1",
"text_1": "What is machine learning?",
"text_2": ["Machine learning is a subset of AI.", "The weather is sunny."]
}'
```
You can also get the raw token embeddings using the pooling endpoint with `token_embed` task:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "answerdotai/answerai-colbert-small-v1",
"input": "What is machine learning?",
"task": "token_embed"
}'
```
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
### ColQwen3 Multi-Modal Late Interaction Models
ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` |
Start the server:
```shell
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
```
#### Text-only scoring and reranking
Use the `/rerank` endpoint:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"query": "What is machine learning?",
"documents": [
"Machine learning is a subset of artificial intelligence.",
"Python is a programming language.",
"Deep learning uses neural networks."
]
}'
```
Or the `/score` endpoint:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"text_1": "What is the capital of France?",
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
}'
```
#### Multi-modal scoring and reranking (text query × image documents)
The `/score` and `/rerank` endpoints also accept multi-modal inputs directly.
Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields
with a `content` list containing `image_url` and `text` parts — the same format used by the
OpenAI chat completion API:
Score a text query against image documents:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"data_1": "Retrieve the city of Beijing",
"data_2": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
Rerank image documents by a text query:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"query": "Retrieve the city of Beijing",
"documents": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
{"type": "text", "text": "Describe the image."}
]
},
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
{"type": "text", "text": "Describe the image."}
]
}
],
"top_n": 2
}'
```
#### Raw token embeddings
You can also get the raw token embeddings using the `/pooling` endpoint with `token_embed` task:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"input": "What is machine learning?",
"task": "token_embed"
}'
```
For **image inputs** via the pooling endpoint, use the chat-style `messages` field:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
#### Examples
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
### Llama Nemotron Multimodal
#### Embedding Model
Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
single-vector embeddings from text and/or images.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
Start the server:
```shell
vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
--trust-remote-code \
--chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
```
!!! note
The chat template bundled with this model's tokenizer is not suitable for
the embeddings API. Use the provided override template above when serving
with the `messages`-based (chat-style) embeddings endpoint.
The override template uses the message `role` to automatically prepend the
appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
or `"document"` for passages (prepends `passage: `). Any other role omits
the prefix.
Embed text queries:
```shell
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
"messages": [
{
"role": "query",
"content": [
{"type": "text", "text": "What is machine learning?"}
]
}
]
}'
```
Embed images via the chat-style `messages` field:
```shell
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
"messages": [
{
"role": "document",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
#### Reranker Model
Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP
backbone with a sequence-classification head for cross-encoder scoring and reranking.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` |
Start the server:
```shell
vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \
--runner pooling \
--trust-remote-code \
--chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja
```
!!! note
The chat template bundled with this checkpoint's tokenizer is not suitable
for the Score/Rerank APIs. Use the provided override template when serving:
`examples/pooling/score/template/nemotron-vl-rerank.jinja`.
Score a text query against an image document:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
"data_1": "Find diagrams about autonomous robots",
"data_2": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Robotics workflow diagram."}
]
}
]
}'
```
Rerank image documents by a text query:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
"query": "Find diagrams about autonomous robots",
"documents": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
{"type": "text", "text": "Robotics workflow diagram."}
]
},
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
{"type": "text", "text": "General skyline photo."}
]
}
],
"top_n": 2
}'
```
### BAAI/bge-m3
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
the architecture is declared as `XLMRobertaModel`, which makes `vLLM` load it as a vanilla ROBERTA model without the
extra weights. To load the full model weights, override its architecture like this:
```shell
vllm serve BAAI/bge-m3 --hf-overrides '{"architectures": ["BgeM3EmbeddingModel"]}'
```
Then you obtain the sparse embeddings like this:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "BAAI/bge-m3",
"task": "token_classify",
"input": ["What is BGE M3?", "Definition of BM25"]
}'
```
Due to limitations in the output schema, the output consists of a list of
token scores for each token for each input. This means that you'll have to call
`/tokenize` as well to be able to pair tokens with scores.
Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how
to do that.
You can obtain the colbert embeddings like this:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "BAAI/bge-m3",
"task": "token_embed",
"input": ["What is BGE M3?", "Definition of BM25"]
}'
```
## Deprecated Features
### Encode task
We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
- `token_embed` is the same as `embed`, using normalization as the activation.
- `token_classify` is the same as `classify`, by default using softmax as the activation.
Pooling models now default support all pooling, you can use it without any settings.
- Extracting hidden states prefers using `token_embed` task.
- Reward models prefers using `token_classify` task.

View File

@@ -1,260 +0,0 @@
# Pooling Models
!!! note
We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
## What are pooling models?
Natural Language Processing (NLP) can be primarily divided into the following two types of tasks:
- Natural Language Understanding (NLU)
- Natural Language Generation (NLG)
The generative models supported by vLLM cover a variety of task types, such as the large language models (LLMs) we are familiar with, multimodal models (VLM) that handle multimodal inputs like images, videos, and audio, speech-to-text transcription models, and real-time models that support streaming input. Their common feature is the ability to generate text. Taking it a step further, vLLM-Omni supports the generation of multimodal content, including images, videos, and audio.
As the capabilities of generative models continue to improve, the boundaries of these models are also constantly expanding. However, certain application scenarios still require specialized small language models to efficiently complete specific tasks. These models typically have the following characteristics:
- They do not require content generation.
- They only need to perform very limited functions, without requiring strong generalization, creativity, or high intelligence.
- They demand extremely low latency and may operate on cost-constrained hardware.
- Text-only models typically have fewer than 1 billion parameters, while multimodal models generally have fewer than 10 billion parameters.
Although these models are relatively small in scale, they are still based on the Transformer architecture, similar or even identical to the most advanced large language models today. Many recently released pooling models are also fine-tuned from large language models, allowing them to benefit from the continuous improvements in large models. This architecture similarity enables them to reuse much of vLLMs infrastructure. If compatible, we would be happy to help them leverage the latest features of vLLM as well.
### Sequence-wise Task and Token-wise Task
The key distinction between sequence-wise task and token-wise task lies in their output granularity: sequence-wise task produces a single result for an entire input sequence, whereas token-wise task yields a result for each individual token within the sequence.
Of course, we also have "plugin" tasks that allow users to customize input and output processors. For more information, please refer to [IO Processor Plugins](../../design/io_processor_plugins.md).
### Pooling Tasks
| Pooling Tasks | Granularity | Outputs |
|-----------------------|---------------|-------------------------------------------------|
| `classify` (see note) | Sequence-wise | probability vector of classes for each sequence |
| `embed` | Sequence-wise | vector representations for each sequence |
| `token_classify` | Token-wise | probability vector of classes for each token |
| `token_embed` | Token-wise | vector representations for each token |
!!! note
Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
### Score Types
The scoring models is designed to compute similarity scores between two input prompts. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`.
| Pooling Tasks | Granularity | Outputs | Score Types | scoring function |
|-----------------------|---------------|----------------------------------------------|--------------------|--------------------------|
| `classify` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier |
| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity |
| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan |
| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) |
!!! note
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
### Pooling Usages
| Pooling Usages | Description |
|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| Classification Usages | Predicting which predefined category, class, or label best corresponds to a given input. |
| Embedding Usages | Converts unstructured data (text, images, audio, etc.) into structured numerical vectors (embeddings). |
| Token Classification Usages | Token-wise classification |
| Token Embedding Usages | Token-wise embedding |
| Scoring Usages | Computes similarity scores between two inputs. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. |
| Reward Usages | Evaluates the quality of outputs generated by a language model, acting as a proxy for human preferences. |
We also have some special models that support multiple pooling tasks, or have specific usage scenarios, or support special inputs and outputs.
For more detailed information, please refer to the link below.
- [Classification Usages](classify.md)
- [Embedding Usages](embed.md)
- [Reward Usages](reward.md)
- [Token Classification Usages](token_classify.md)
- [Token Embedding Usages](token_embed.md)
- [Scoring Usages](scoring.md)
- [Specific Model Examples](specific_models.md)
## Offline Inference
Each pooling model in vLLM supports one or more of these tasks according to
[Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks],
enabling the corresponding APIs.
### Offline APIs corresponding to pooling tasks
| Task | APIs |
|------------------|---------------------------------------------------------------------------------------|
| `embed` | `LLM.embed(...)`, `LLM.encode(..., pooling_task="embed")`, `LLM.score(...)`(see note) |
| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`, `LLM.score(...)` |
| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` |
| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` |
| `plugin` | `LLM.encode(..., pooling_task="plugin")` |
!!! note
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
### `LLM.classify`
The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
It is primarily designed for [classification models](classify.md).
For more information about `LLM.embed`, see [this page](classify.md#offline-inference).
### `LLM.embed`
The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
It is primarily designed for [embedding models](embed.md).
For more information about `LLM.embed`, see [this page](embed.md#offline-inference).
### `LLM.score`
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
It is primarily designed for [score models](scoring.md).
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
Please use one of the more specific methods or set the task directly when using `LLM.encode`, refer to the [table above](#offline-apis-corresponding-to-pooling-tasks).
### Examples
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="embed")
data = output.outputs.data
print(f"Data: {data!r}")
```
## Online Serving
Our online Server provides endpoints that correspond to the offline APIs:
- Corresponding to `LLM.embed`:
- [Cohere Embed API](embed.md#cohere-embed-api) (`/v2/embed`)
- [Openai-compatible Embeddings API](embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
- Corresponding to `LLM.classify`:
- [Classification API](classify.md#online-serving)(`/classify`)
- Corresponding to `LLM.score`:
- [Score API](scoring.md#score-api)(`/score`)
- [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- Pooling API (`/pooling`) is similar to `LLM.encode`, being applicable to all types of pooling models.
The following introduces the Pooling API. For other APIs, please refer to the link above.
### Pooling API
Our Pooling API (`/pooling`) is similar to `LLM.encode`, being applicable to all types of pooling models.
The input format is the same as [Embeddings API](embed.md#openai-compatible-embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Please use one of the more specific APIs or set the task directly when using the Pooling API, refer to the [table above](#offline-apis-corresponding-to-pooling-tasks).
Code example: [examples/pooling/pooling/pooling_online.py](../../../examples/pooling/pooling/pooling_online.py)
### Examples
```python
# start a supported embeddings model server with `vllm serve`, e.g.
# vllm serve intfloat/e5-small
import requests
host = "localhost"
port = "8000"
model_name = "intfloat/e5-small"
api_url = f"http://{host}:{port}/pooling"
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompt = {"model": model_name, "input": prompts, "task": "embed"}
response = requests.post(api_url, json=prompt)
for output in response.json()["data"]:
data = output["data"]
print(f"Data: {data!r} (size={len(data)})")
```
## Configuration
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
before returning them.
### Model Runner
Run a model in pooling mode via the option `--runner pooling`.
!!! tip
There is no need to set this option in the vast majority of cases as vLLM can automatically
detect the appropriate model runner via `--runner auto`.
### Model Conversion
vLLM can adapt models for various pooling tasks via the option `--convert <type>`.
If `--runner pooling` has been set (manually or automatically) but the model does not implement the
[VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface,
vLLM will attempt to automatically convert the model according to the architecture names
shown in the table below.
| Architecture | `--convert` | Supported pooling tasks |
|-------------------------------------------------|-------------|------------------------------|
| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` |
| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` |
| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify` |
!!! tip
You can explicitly set `--convert <type>` to specify how to convert the model.
### Pooler Configuration
#### Predefined models
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
you can override some of its attributes via the `--pooler-config` option.
#### Converted models
If the model has been converted via `--convert` (see above),
the pooler assigned to each task has the following attributes by default:
| Task | Pooling Type | Normalization | Softmax |
| ---------- | ------------ | ------------- | ------- |
| `embed` | `LAST` | ✅︎ | ❌ |
| `classify` | `LAST` | ❌ | ✅︎ |
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers' defaults.
## Removed Features
### Encode task
We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
- `token_embed` is the same as `embed`, using normalization as the activation.
- `token_classify` is the same as `classify`, by default using softmax as the activation.
Pooling models now default support all pooling, you can use it without any settings.
- Extracting hidden states prefers using `token_embed` task.
- Named Entity Recognition (NER) and reward models prefers using `token_classify` task.
### Score task
`score` task is deprecated and will be removed in v0.20. Please use `classify` instead. Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.

View File

@@ -1,278 +0,0 @@
# Classification Usages
Classification involves predicting which predefined category, class, or label best corresponds to a given input.
## Summary
- Model Usage: (sequence) classification
- Pooling Task: `classify`
- Offline APIs:
- `LLM.classify(...)`
- `LLM.encode(..., pooling_task="classify")`
- Online APIs:
- [Classification API](classify.md#online-serving) (`/classify`)
- Pooling API (`/pooling`)
The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence.
Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md).
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled, please refer to [this page](scoring.md).
## Typical Use Cases
### Classification
The most fundamental application of classification models is to categorize input data into predefined classes.
## Supported Models
### Text-only Models
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `jason9693/Qwen2.5-1.5B-apeach` | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
### Multimodal Models
!!! note
For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
| `Qwen2_5_VLForSequenceClassification`<sup>C</sup> | Qwen2_5_VL-based | T + I<sup>E+</sup> + V<sup>E+</sup> | `muziyongshixin/Qwen2.5-VL-7B-for-VideoCls` | | |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
\* Feature support is the same as that of the original model.
If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
### Cross-encoder Models
Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md).
--8<-- "docs/models/pooling_models/scoring.md:supported-cross-encoder-models"
### Reward Models
Using (sequence) classification models as reward models. For more information, see [Reward Models](reward.md).
--8<-- "docs/models/pooling_models/reward.md:supported-sequence-reward-models"
## Offline Inference
### Pooling Parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
### `LLM.classify`
The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
```python
from vllm import LLM
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
(output,) = llm.classify("Hello, my name is")
probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```
A code example can be found here: [examples/offline_inference/basic/classify.py](../../../examples/basic/offline_inference/classify.py)
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
Set `pooling_task="classify"` when using `LLM.encode` for classification Models:
```python
from vllm import LLM
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="classify")
data = output.outputs.data
print(f"Data: {data!r}")
```
## Online Serving
### Classification API
Online `/classify` API is similar to `LLM.classify`.
#### Completion Parameters
The following Classification API parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
```
The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
#### Chat Parameters
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
```
these extra parameters are supported instead:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
#### Example Requests
Code example: [examples/pooling/classify/classification_online.py](../../../examples/pooling/classify/classification_online.py)
You can classify multiple texts by passing an array of strings:
```bash
curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \
-d '{
"model": "jason9693/Qwen2.5-1.5B-apeach",
"input": [
"Loved the new café—coffee was great.",
"This update broke everything. Frustrating."
]
}'
```
??? console "Response"
```json
{
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
"object": "list",
"created": 1745383065,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
],
"num_classes": 2
},
{
"index": 1,
"label": "Spoiled",
"probs": [
0.26448777318000793,
0.7355121970176697
],
"num_classes": 2
}
],
"usage": {
"prompt_tokens": 20,
"total_tokens": 20,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
```
You can also pass a string directly to the `input` field:
```bash
curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \
-d '{
"model": "jason9693/Qwen2.5-1.5B-apeach",
"input": "Loved the new café—coffee was great."
}'
```
??? console "Response"
```json
{
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
"object": "list",
"created": 1745383213,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
],
"num_classes": 2
}
],
"usage": {
"prompt_tokens": 10,
"total_tokens": 10,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
```
## More examples
More examples can be found here: [examples/pooling/classify](../../../examples/pooling/classify)
## Supported Features
### Enable/disable activation
You can enable or disable activation via `use_activation`.
### Problem type (e.g. `multi_label_classification`)
You can modify the `problem_type` via problem_type in the Hugging Face config. The supported problem types are: `single_label_classification`, `multi_label_classification`, and `regression`.
Implement alignment with transformers [ForSequenceClassificationLoss](https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92).
### Logit bias
You can modify the `logit_bias` (aka `sigmoid_normalize`) through the logit_bias parameter in `vllm.config.PoolerConfig`.
## Removed Features
### Remove softmax from PoolingParams
We have already removed `softmax` and `activation` from PoolingParams. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.

View File

@@ -1,546 +0,0 @@
# Embedding Usages
Embedding models are a class of machine learning models designed to transform unstructured data—such as text, images, or audio—into a structured numerical representation known as an embedding.
## Summary
- Model Usage: (sequence) embedding
- Pooling Task: `embed`
- Offline APIs:
- `LLM.embed(...)`
- `LLM.encode(..., pooling_task="embed")`
- `LLM.score(...)`
- Online APIs:
- [Cohere Embed API](embed.md#cohere-embed-api) (`/v2/embed`)
- [Openai-compatible Embeddings API](embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
- Pooling API (`/pooling`)
The primary distinction between (sequence) embedding and token embedding lies in their output granularity: (sequence) embedding produces a single embedding vector for an entire input sequence, whereas token embedding generates an embedding for each individual token within the sequence.
Many embedding models support both (sequence) embedding and token embedding. For further details on token embedding, please refer to [this page](token_embed.md).
## Typical Use Cases
### Embedding
The most basic use case of embedding models is to embed the inputs, e.g. for RAG.
### Pairwise Similarity
You can compute pairwise similarity scores to build a similarity matrix using the [Score API](scoring.md).
## Supported Models
--8<-- [start:supported-embed-models]
### Text-only Models
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
| `VoyageQwen3BidirectionalEmbedModel`<sup>C</sup> | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ |
| `XLMRobertaModel` | XLMRobertaModel-based | `BAAI/bge-m3` (see note), `intfloat/multilingual-e5-base`, `jinaai/jina-embeddings-v3` (see note), etc. | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
!!! note
The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
!!! note
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
!!! note
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
!!! note
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings, See [this page](specific_models.md#baaibge-m3) for more information.
!!! note
`jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vllm temporarily only supports text-matching tasks by merging LoRA weights.
### Multimodal Models
!!! note
For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./README.md#model-conversion))
\* Feature support is the same as that of the original model.
If your model is not in the above list, we will try to automatically convert the model using
[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings
of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
!!! note
Although vLLM supports automatically converting models of any architecture into embedding models via --convert embed, to get the best results, you should use pooling models that are specifically trained as such.
--8<-- [end:supported-embed-models]
## Offline Inference
### Pooling Parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:embed-pooling-params"
```
### `LLM.embed`
The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.embed("Hello, my name is")
embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```
A code example can be found here: [examples/offline_inference/basic/embed.py](../../../examples/basic/offline_inference/embed.py)
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
Set `pooling_task="embed"` when using `LLM.encode` for embedding Models:
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="embed")
data = output.outputs.data
print(f"Data: {data!r}")
```
### `LLM.score`
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
All models that support embedding task also support using the score API to compute similarity scores by calculating the cosine similarity of two input prompt's embeddings.
```python
from vllm import LLM
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.score(
"What is the capital of France?",
"The capital of Brazil is Brasilia.",
)
score = output.outputs.score
print(f"Score: {score}")
```
## Online Serving
### OpenAI-Compatible Embeddings API
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/pooling/embed/openai_embedding_client.py](../../../examples/pooling/embed/openai_embedding_client.py)
#### Completion Parameters
The following Classification API parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
```
The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
```
#### Chat Parameters
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
```
these extra parameters are supported instead:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
```
#### Examples
If the model has a [chat template](../../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](../../serving/openai_compatible_server.md#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
??? code
```python
from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
def create_chat_embeddings(
client: OpenAI,
*,
messages: list[ChatCompletionMessageParam],
model: str,
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
) -> CreateEmbeddingResponse:
return client.post(
"/embeddings",
cast_to=CreateEmbeddingResponse,
body={"messages": messages, "model": model, "encoding_format": encoding_format},
)
```
##### Multi-modal inputs
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
and passing a list of `messages` in the request. Refer to the examples below for illustration.
=== "VLM2Vec"
To serve the model:
```bash
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
```
!!! important
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling`
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
??? code
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="EMPTY",
)
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
response = create_chat_embeddings(
client,
model="TIGER-Lab/VLM2Vec-Full",
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
encoding_format="float",
)
print("Image embedding output:", response.data[0].embedding)
```
=== "DSE-Qwen2-MRL"
To serve the model:
```bash
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
```
!!! important
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
!!! important
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.
Full example: [examples/pooling/embed/vision_embedding_online.py](../../../examples/pooling/embed/vision_embedding_online.py)
### Cohere Embed API
Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed) which adds support for some modern embedding feature such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models).
#### Cohere Embed API request parameters
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `model` | string | Yes | Model name |
| `input_type` | string | No | Prompt prefix key (model-dependent, see below) |
| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) |
| `images` | list[string] | No | Base64 data URI images |
| `inputs` | list[object] | No | Mixed text and image content objects |
| `embedding_types` | list[string] | No | Output types (default: `["float"]`) |
| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) |
| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) |
#### Text embedding
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["Hello world", "How are you?"],
"embedding_types": ["float"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [
[0.012, -0.034, ...],
[0.056, 0.078, ...]
]
},
"texts": ["Hello world", "How are you?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 12}
}
}
```
#### Mixed text and image inputs
For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content:
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "google/siglip-so400m-patch14-384",
"inputs": [
{
"content": [
{"type": "text", "text": "A photo of a cat"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}
]
}
],
"embedding_types": ["float"]
}'
```
#### Embedding types
The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call:
| Type | Description |
| ---- | ----------- |
| `float` | Raw float32 embeddings (default) |
| `binary` | Bit-packed signed binary |
| `ubinary` | Bit-packed unsigned binary |
| `base64` | Little-endian float32 encoded as base64 |
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["What is machine learning?"],
"embedding_types": ["float", "binary"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [[0.012, -0.034, ...]],
"binary": [[42, -117, ...]]
},
"texts": ["What is machine learning?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 8}
}
}
```
#### Truncation
The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled:
| Value | Behavior |
| ----- | --------- |
| `END` (default) | Keep the first tokens, drop the end |
| `START` | Keep the last tokens, drop the beginning |
| `NONE` | Return an error if the input is too long |
#### Input type and prompt prefixes
The `input_type` field selects a prompt prefix to prepend to each text input. The available values
depend on the model:
- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are
the valid `input_type` values and the corresponding value is prepended to each text.
- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are
the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`,
so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`.
- **Other models**: `input_type` is not accepted and will raise a validation error if passed.
## More examples
More examples can be found here: [examples/pooling/embed](../../../examples/pooling/embed)
## Supported Features
### Enable/disable normalize
You can enable or disable normalize via `use_activation`.
### Matryoshka Embeddings
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
!!! warning
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
```json
{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
```
#### Manually enable Matryoshka Embeddings
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled.
```bash
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
```
#### Offline Inference
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams].
```python
from vllm import LLM, PoolingParams
llm = LLM(
model="jinaai/jina-embeddings-v3",
runner="pooling",
trust_remote_code=True,
)
outputs = llm.embed(
["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32),
)
print(outputs[0].outputs)
```
A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy_offline.py](../../../examples/pooling/embed/embed_matryoshka_fy_offline.py)
#### Online Inference
Use the following command to start the vLLM server.
```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
```
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
```bash
curl http://127.0.0.1:8000/v1/embeddings \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"input": "Follow the white rabbit.",
"model": "jinaai/jina-embeddings-v3",
"encoding_format": "float",
"dimensions": 32
}'
```
Expected output:
```json
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
```
An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy_client.py](../../../examples/pooling/embed/openai_embedding_matryoshka_fy_client.py)
## Removed Features
### Remove `normalize` from PoolingParams
We have already removed `normalize` from PoolingParams, use `use_activation` instead.

View File

@@ -1,136 +0,0 @@
# Reward Usages
A reward model (RM) is designed to evaluate and score the quality of outputs generated by a language model, acting as a proxy for human preferences.
## Summary
- Model Usage: reward
- Pooling Task:
| Model Types | Pooling Tasks |
|------------------------------------|----------------|
| (sequence) (outcome) reward models | classify |
| token (outcome) reward models | token_classify |
| process reward models | token_classify |
- Offline APIs:
- `LLM.encode(..., pooling_task="...")`
- Online APIs:
- Pooling API (`/pooling`)
## Supported Models
### Reward Models
Using sequence classification models as (sequence) (outcome) reward models, the usage and supported features are the same as for normal [classification models](classify.md).
--8<-- [start:supported-sequence-reward-models]
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `Skywork/Skywork-Reward-V2-Qwen3-0.6B`, etc. | ✅︎ | ✅︎ |
| `LlamaForSequenceClassification`<sup>C</sup> | Llama-based | `Skywork/Skywork-Reward-V2-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
--8<-- [end:supported-sequence-reward-models]
### Token Reward Models
The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence.
Using token classification models as token (outcome) reward models, the usage and supported features are the same as for normal [token classification models](token_classify.md).
--8<-- [start:supported-token-reward-models]
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model].
--8<-- [end:supported-token-reward-models]
### Process Reward Models
The process reward models used for evaluating intermediate steps are crucial to achieving the desired outcome.
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
!!! important
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
## Offline Inference
### Pooling Parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
- Reward Models
Set `pooling_task="classify"` when using `LLM.encode` for (sequence) (outcome) reward models:
```python
from vllm import LLM
llm = LLM(model="Skywork/Skywork-Reward-V2-Qwen3-0.6B", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="classify")
data = output.outputs.data
print(f"Data: {data!r}")
```
- Token Reward Models
Set `pooling_task="token_classify"` when using `LLM.encode` for token (outcome) reward models:
```python
from vllm import LLM
llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
data = output.outputs.data
print(f"Data: {data!r}")
```
- Process Reward Models
Set `pooling_task="token_classify"` when using `LLM.encode` for token (outcome) reward models:
```python
from vllm import LLM
llm = LLM(model="Qwen/Qwen2.5-Math-PRM-7B", runner="pooling")
(output,) = llm.encode("Hello, my name is<extra_0><extra_0><extra_0>", pooling_task="token_classify")
data = output.outputs.data
print(f"Data: {data!r}")
```
## Online Serving
Please refer to the [pooling API](README.md#pooling-api). Pooling task corresponding to reward model types refer to the [table above](#summary).

View File

@@ -1,451 +0,0 @@
# Scoring Usages
The score models is designed to compute similarity scores between two input prompts. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`.
!!! note
vLLM handles only the model inference component of RAG pipelines (such as embedding generation and reranking). For higher-level RAG orchestration, you should leverage integration frameworks like [LangChain](https://github.com/langchain-ai/langchain).
## Summary
- Model Usage: Scoring
- Pooling Task:
| Score Types | Pooling Tasks | scoring function |
|--------------------|-----------------------|--------------------------|
| `cross-encoder` | `classify` (see note) | linear classifier |
| `late-interaction` | `token_embed` | late interaction(MaxSim) |
| `bi-encoder` | `embed` | cosine similarity |
- Offline APIs:
- `LLM.score`
- Online APIs:
- [Score API](scoring.md#score-api) (`/score`)
- [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
!!! note
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
## Supported Models
### Cross-encoder models
[Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
--8<-- [start:supported-cross-encoder-models]
#### Text-only Models
| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
\* Feature support is the same as that of the original model.
!!! note
Some models require a specific prompt format to work correctly.
You can find Example HF Models's corresponding score template in [examples/pooling/score/template/](../../../examples/pooling/score/template)
Examples : [examples/pooling/score/using_template_offline.py](../../../examples/pooling/score/using_template_offline.py) [examples/pooling/score/using_template_online.py](../../../examples/pooling/score/using_template_online.py)
!!! note
Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command.
```bash
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
```
!!! note
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
!!! note
Load the official original `mxbai-rerank-v2` by using the following command.
```bash
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
```
!!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../../examples/pooling/score/qwen3_reranker_online.py).
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
```
#### Multimodal Models
!!! note
For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ------ | ----------------- | ------------------------------ | ------------------------------------------ |
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + I<sup>E+</sup> | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](README.md#model-conversion))
\* Feature support is the same as that of the original model.
!!! note
Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`.
```bash
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
```
--8<-- [end:supported-cross-encoder-models]
### Late-interaction models
All models that support token embedding task also support using the score API to compute similarity scores by calculating the late interaction of two input prompts. See [this page](token_embed.md) for more information about token embedding models.
--8<-- "docs/models/pooling_models/token_embed.md:supported-token-embed-models"
### Bi-encoder
All models that support embedding task also support using the score API to compute similarity scores by calculating the cosine similarity of two input prompt's embeddings. See [this page](embed.md) for more information about embedding models.
--8<-- "docs/models/pooling_models/embed.md:supported-embed-models"
## Offline Inference
### Pooling Parameters
The following [pooling parameters][vllm.PoolingParams] are only supported by cross-encoder models and do not work for late-interaction and bi-encoder models.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
### `LLM.score`
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
```python
from vllm import LLM
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
(output,) = llm.score(
"What is the capital of France?",
"The capital of Brazil is Brasilia.",
)
score = output.outputs.score
print(f"Score: {score}")
```
A code example can be found here: [examples/basic/offline_inference/score.py](../../../examples/basic/offline_inference/score.py)
## Online Serving
### Score API
Our Score API (`/score`) is similar to `LLM.score`, compute similarity scores between two input prompts.
#### Parameters
The following Score API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
#### Examples
##### Single inference
You can pass a string to both `queries` and `documents`, forming a single sentence pair.
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float",
"queries": "What is the capital of France?",
"documents": "The capital of France is Paris."
}'
```
??? console "Response"
```json
{
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 1
}
],
"usage": {}
}
```
##### Batch inference
You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs
where each pair is built from `queries` and a string in `documents`.
The total number of pairs is `len(documents)`.
??? console "Request"
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"queries": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
??? console "Response"
```json
{
"id": "score-request-id",
"object": "list",
"created": 693570,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 0.001094818115234375
},
{
"index": 1,
"object": "score",
"score": 1
}
],
"usage": {}
}
```
You can pass a list to both `queries` and `documents`, forming multiple sentence pairs
where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`).
The total number of pairs is `len(documents)`.
??? console "Request"
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float",
"queries": [
"What is the capital of Brazil?",
"What is the capital of France?"
],
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
??? console "Response"
```json
{
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 1
},
{
"index": 1,
"object": "score",
"score": 1
}
],
"usage": {}
}
```
##### Multi-modal inputs
You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal input (image, etc.) in the request. Refer to the examples below for illustration.
=== "JinaVL-Reranker"
To serve the model:
```bash
vllm serve jinaai/jina-reranker-m0
```
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
??? Code
```python
import requests
response = requests.post(
"http://localhost:8000/v1/score",
json={
"model": "jinaai/jina-reranker-m0",
"queries": "slm markdown",
"documents": [
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
],
},
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
]
},
],
},
)
response.raise_for_status()
response_json = response.json()
print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"])
```
Full example:
- [examples/pooling/score/vision_score_api_online.py](../../../examples/pooling/score/vision_score_api_online.py)
- [examples/pooling/score/vision_rerank_api_online.py](../../../examples/pooling/score/vision_rerank_api_online.py)
### Rerank API
`/rerank`, `/v1/rerank`, and `/v2/rerank` APIs are compatible with both [Jina AI's rerank API interface](https://jina.ai/reranker/) and
[Cohere's rerank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools.
Code example: [examples/pooling/score/rerank_api_online.py](../../../examples/pooling/score/rerank_api_online.py)
#### Parameters
The following rerank api parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
#### Examples
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
??? console "Request"
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/v1/rerank' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-base",
"query": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Horses and cows are both animals"
]
}'
```
??? console "Response"
```json
{
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
"model": "BAAI/bge-reranker-base",
"usage": {
"total_tokens": 56
},
"results": [
{
"index": 1,
"document": {
"text": "The capital of France is Paris."
},
"relevance_score": 0.99853515625
},
{
"index": 0,
"document": {
"text": "The capital of Brazil is Brasilia."
},
"relevance_score": 0.0005860328674316406
}
]
}
```
## More examples
More examples can be found here: [examples/pooling/score](../../../examples/pooling/score)
## Supported Features
AS cross-encoder models are a subset of classification models that accept two prompts as input and output num_labels equal to 1, cross-encoder features should be consistent with (sequence) classification. For more information, see [this page](classify.md#supported-features).
### Score Template
Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](../../serving/openai_compatible_server.md#chat-template)).
Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../../examples/pooling/score/template/nemotron-rerank.jinja)
### Enable/disable activation
You can enable or disable activation via `use_activation` only works for cross-encoder models.

View File

@@ -1,400 +0,0 @@
# Specific Model Examples
## ColBERT Late Interaction Models
[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders.
vLLM supports ColBERT models with multiple encoder backbones:
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |
**BERT-based ColBERT** models work out of the box:
```shell
vllm serve answerdotai/answerai-colbert-small-v1
```
For **non-BERT backbones**, use `--hf-overrides` to set the correct architecture:
```shell
# ModernBERT backbone
vllm serve lightonai/GTE-ModernColBERT-v1 \
--hf-overrides '{"architectures": ["ColBERTModernBertModel"]}'
# Jina XLM-RoBERTa backbone
vllm serve jinaai/jina-colbert-v2 \
--hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
--trust-remote-code
# LFM2 backbone
vllm serve LiquidAI/LFM2-ColBERT-350M \
--hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
```
Then you can use the rerank API:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "answerdotai/answerai-colbert-small-v1",
"query": "What is machine learning?",
"documents": [
"Machine learning is a subset of artificial intelligence.",
"Python is a programming language.",
"Deep learning uses neural networks."
]
}'
```
Or the score API:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "answerdotai/answerai-colbert-small-v1",
"text_1": "What is machine learning?",
"text_2": ["Machine learning is a subset of AI.", "The weather is sunny."]
}'
```
You can also get the raw token embeddings using the pooling API with `token_embed` task:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "answerdotai/answerai-colbert-small-v1",
"input": "What is machine learning?",
"task": "token_embed"
}'
```
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../../examples/pooling/score/colbert_rerank_online.py)
## ColQwen3 Multi-Modal Late Interaction Models
ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` |
Start the server:
```shell
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
```
### Text-only scoring and reranking
Use the `/rerank` API:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"query": "What is machine learning?",
"documents": [
"Machine learning is a subset of artificial intelligence.",
"Python is a programming language.",
"Deep learning uses neural networks."
]
}'
```
Or the `/score` API:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"text_1": "What is the capital of France?",
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
}'
```
### Multi-modal scoring and reranking (text query × image documents)
The `/score` and `/rerank` APIs also accept multi-modal inputs directly.
Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields
with a `content` list containing `image_url` and `text` parts — the same format used by the
OpenAI chat completion API:
Score a text query against image documents:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"data_1": "Retrieve the city of Beijing",
"data_2": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
Rerank image documents by a text query:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"query": "Retrieve the city of Beijing",
"documents": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
{"type": "text", "text": "Describe the image."}
]
},
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
{"type": "text", "text": "Describe the image."}
]
}
],
"top_n": 2
}'
```
### Raw token embeddings
You can also get the raw token embeddings using the `/pooling` API with `token_embed` task:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"input": "What is machine learning?",
"task": "token_embed"
}'
```
For **image inputs** via the pooling API, use the chat-style `messages` field:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
### Examples
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../../examples/pooling/score/colqwen3_rerank_online.py)
## ColQwen3.5 Multi-Modal Late Interaction Models
ColQwen3.5 is based on [ColPali](https://arxiv.org/abs/2407.01449), extending ColBERT's late interaction approach to **multi-modal** inputs. It uses the Qwen3.5 hybrid backbone (linear + full attention) and produces per-token L2-normalized vectors for MaxSim scoring.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `ColQwen3_5` | Qwen3.5 | `athrael-soju/colqwen3.5-4.5B` |
Start the server:
```shell
vllm serve athrael-soju/colqwen3.5-4.5B --max-model-len 4096
```
Then you can use the rerank endpoint:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "athrael-soju/colqwen3.5-4.5B",
"query": "What is machine learning?",
"documents": [
"Machine learning is a subset of artificial intelligence.",
"Python is a programming language.",
"Deep learning uses neural networks."
]
}'
```
Or the score endpoint:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "athrael-soju/colqwen3.5-4.5B",
"text_1": "What is the capital of France?",
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
}'
```
An example can be found here: [examples/pooling/score/colqwen3_5_rerank_online.py](../../../examples/pooling/score/colqwen3_5_rerank_online.py)
## Llama Nemotron Multimodal
### Embedding Model
Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
single-vector embeddings from text and/or images.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
Start the server:
```shell
vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
--trust-remote-code \
--chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
```
!!! note
The chat template bundled with this model's tokenizer is not suitable for
the embeddings API. Use the provided override template above when serving
with the `messages`-based (chat-style) embeddings API.
The override template uses the message `role` to automatically prepend the
appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
or `"document"` for passages (prepends `passage: `). Any other role omits
the prefix.
Embed text queries:
```shell
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
"messages": [
{
"role": "query",
"content": [
{"type": "text", "text": "What is machine learning?"}
]
}
]
}'
```
Embed images via the chat-style `messages` field:
```shell
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
"messages": [
{
"role": "document",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
### Reranker Model
Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP
backbone with a sequence-classification head for cross-encoder scoring and reranking.
| Architecture | Backbone | Example HF Models |
| - | - | - |
| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` |
Start the server:
```shell
vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \
--runner pooling \
--trust-remote-code \
--chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja
```
!!! note
The chat template bundled with this checkpoint's tokenizer is not suitable
for the Score/Rerank APIs. Use the provided override template when serving:
`examples/pooling/score/template/nemotron-vl-rerank.jinja`.
Score a text query against an image document:
```shell
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
"data_1": "Find diagrams about autonomous robots",
"data_2": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Robotics workflow diagram."}
]
}
]
}'
```
Rerank image documents by a text query:
```shell
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
"query": "Find diagrams about autonomous robots",
"documents": [
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
{"type": "text", "text": "Robotics workflow diagram."}
]
},
{
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
{"type": "text", "text": "General skyline photo."}
]
}
],
"top_n": 2
}'
```
## BAAI/bge-m3
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
the architecture is declared as `XLMRobertaModel`, which makes `vLLM` load it as a vanilla ROBERTA model without the
extra weights. To load the full model weights, override its architecture like this:
```shell
vllm serve BAAI/bge-m3 --hf-overrides '{"architectures": ["BgeM3EmbeddingModel"]}'
```
Then you obtain the sparse embeddings like this:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "BAAI/bge-m3",
"task": "token_classify",
"input": ["What is BGE M3?", "Definition of BM25"]
}'
```
Due to limitations in the output schema, the output consists of a list of
token scores for each token for each input. This means that you'll have to call
`/tokenize` as well to be able to pair tokens with scores.
Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how
to do that.
You can obtain the colbert embeddings like this:
```shell
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "BAAI/bge-m3",
"task": "token_embed",
"input": ["What is BGE M3?", "Definition of BM25"]
}'
```

View File

@@ -1,89 +0,0 @@
# Token Classification Usages
## Summary
- Model Usage: token classification
- Pooling Tasks: `token_classify`
- Offline APIs:
- `LLM.encode(..., pooling_task="token_classify")`
- Online APIs:
- Pooling API (`/pooling`)
The key distinction between (sequence) classification and token classification lies in their output granularity: (sequence) classification produces a single result for an entire input sequence, whereas token classification yields a result for each individual token within the sequence.
Many classification models support both (sequence) classification and token classification. For further details on (sequence) classification, please refer to [this page](classify.md).
## Typical Use Cases
### Named Entity Recognition (NER)
For implementation examples, see:
Offline: [examples/pooling/token_classify/ner_offline.py](../../../examples/pooling/token_classify/ner_offline.py)
Online: [examples/pooling/token_classify/ner_online.py](../../../examples/pooling/token_classify/ner_online.py)
### Sparse retrieval (lexical matching)
The BAAI/bge-m3 model leverages token classification for sparse retrieval. For more information, see [this page](specific_models.md#baaibge-m3).
## Supported Models
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
| `Qwen3ForTokenClassification`<sup>C</sup> | Qwen3-based | `bd2lcco/Qwen3-0.6B-finetuned` | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./README.md#model-conversion))
\* Feature support is the same as that of the original model.
If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
### As Reward Models
Using token classification models as reward models. For details on reward models, see [Reward Models](reward.md).
--8<-- "docs/models/pooling_models/reward.md:supported-token-reward-models"
## Offline Inference
### Pooling Parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
Set `pooling_task="token_classify"` when using `LLM.encode` for token classification Models:
```python
from vllm import LLM
llm = LLM(model="boltuix/NeuroBERT-NER", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
data = output.outputs.data
print(f"Data: {data!r}")
```
## Online Serving
Please refer to the [pooling API](README.md#pooling-api) and use `"task":"token_classify"`.
## More examples
More examples can be found here: [examples/pooling/token_classify](../../../examples/pooling/token_classify)
## Supported Features
Token classification features should be consistent with (sequence) classification. For more information, see [this page](classify.md#supported-features).

View File

@@ -1,126 +0,0 @@
# Token Embedding Usages
## Summary
- Model Usage: Token classification models
- Pooling Tasks: `token_embed`
- Offline APIs:
- `LLM.encode(..., pooling_task="token_embed")`
- Online APIs:
- Pooling API (`/pooling`)
The difference between the (sequence) embedding task and the token embedding task is that (sequence) embedding outputs one embedding for each sequence, while token embedding outputs a embedding for each token.
Many embedding models support both (sequence) embedding and token embedding. For further details on (sequence) embedding, please refer to [this page](embed.md).
## Typical Use Cases
### Multi-Vector Retrieval
For implementation examples, see:
Offline: [examples/pooling/token_embed/multi_vector_retrieval_offline.py](../../../examples/pooling/token_embed/multi_vector_retrieval_offline.py)
Online: [examples/pooling/token_embed/multi_vector_retrieval_online.py](../../../examples/pooling/token_embed/multi_vector_retrieval_online.py)
### Late interaction
Similarity scores can be computed using late interaction between two input prompts via the score API. For more information, see [Score API](scoring.md).
### Extract last hidden states
Models of any architecture can be converted into embedding models using `--convert embed`. Token embedding can then be used to extract the last hidden states from these models.
## Supported Models
--8<-- [start:supported-token-embed-models]
### Text-only Models
| Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
| `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
| `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
| `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
### Multimodal Models
!!! note
For more information about multimodal models inputs, see [this page](../supported_models.md#list-of-multimodal-language-models).
| Architecture | Models | Inputs | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
| ------------ | ------ | ----- | ----------------- | ------------------------------ | ------------------------------------------ |
| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | |
| `ColQwen3` | Qwen3-VL | T / I | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` | | |
| `ColQwen3_5` | ColQwen3.5 | T + I + V | `athrael-soju/colqwen3.5-4.5B-v3` | | |
| `OpsColQwen3Model` | Qwen3-VL | T / I | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` | | |
| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | T / I | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` | ✅︎ | ✅︎ |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./README.md#model-conversion))
\* Feature support is the same as that of the original model.
If your model is not in the above list, we will try to automatically convert the model using [as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model].
--8<-- [end:supported-token-embed-models]
## Offline Inference
### Pooling Parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:embed-pooling-params"
```
### `LLM.encode`
The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
Set `pooling_task="token_embed"` when using `LLM.encode` for token embedding Models:
```python
from vllm import LLM
llm = LLM(model="answerdotai/answerai-colbert-small-v1", runner="pooling")
(output,) = llm.encode("Hello, my name is", pooling_task="token_embed")
data = output.outputs.data
print(f"Data: {data!r}")
```
### `LLM.score`
The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
All models that support token embedding task also support using the score API to compute similarity scores by calculating the late interaction of two input prompts.
```python
from vllm import LLM
llm = LLM(model="answerdotai/answerai-colbert-small-v1", runner="pooling")
(output,) = llm.score(
"What is the capital of France?",
"The capital of Brazil is Brasilia.",
)
score = output.outputs.score
print(f"Score: {score}")
```
## Online Serving
Please refer to the [pooling API](README.md#pooling-api) and use `"task":"token_embed"`.
## More examples
More examples can be found here: [examples/pooling/token_embed](../../../examples/pooling/token_embed)
## Supported Features
Token embedding features should be consistent with (sequence) embedding. For more information, see [this page](embed.md#supported-features).

View File

@@ -1,6 +1,6 @@
# Supported Models
vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models/README.md) models across various tasks.
vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks.
For each task, we list the model architectures that have been implemented in vLLM.
Alongside each architecture, we include some popular models that use it.
@@ -499,6 +499,156 @@ Some models are supported only via the [Transformers modeling backend](#transfor
!!! note
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
### Pooling Models
See [this page](./pooling_models.md) for more information on how to use pooling models.
!!! important
Since some model architectures support both generative and pooling tasks,
you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode.
#### Embedding
These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | |
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
| `VoyageQwen3BidirectionalEmbedModel`<sup>C</sup> | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
!!! note
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
!!! note
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
!!! note
`jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vllm temporarily only supports text-matching tasks by merging LoRA weights.
!!! note
The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
If your model is not in the above list, we will try to automatically convert the model using
[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings
of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
#### Classification
These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
#### Cross-encoder / Reranker
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | N/A | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
!!! note
Some models require a specific prompt format to work correctly.
You can find Example HF Models's corresponding score template in [examples/pooling/score/template/](../../examples/pooling/score/template)
Examples : [examples/pooling/score/using_template_offline.py](../../examples/pooling/score/using_template_offline.py) [examples/pooling/score/using_template_online.py](../../examples/pooling/score/using_template_online.py)
!!! note
Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command.
```bash
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
```
!!! note
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
!!! note
Load the official original `mxbai-rerank-v2` by using the following command.
```bash
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}'
```
!!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../examples/pooling/score/qwen3_reranker_online.py).
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
```
#### Reward Modeling
These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ |
!!! important
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
#### Token Classification
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note
Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner_offline.py](../../examples/pooling/token_classify/ner_offline.py), [examples/pooling/token_classify/ner_online.py](../../examples/pooling/token_classify/ner_online.py).
## List of Multimodal Language Models
The following modalities are supported depending on the model:
@@ -557,7 +707,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
@@ -666,23 +816,56 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
!!! note
`VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
## Pooling Models
### Pooling Models
See [this page](pooling_models/README.md) for more information on how to use pooling models.
See [this page](./pooling_models.md) for more information on how to use pooling models.
!!! important
Since some model architectures support both generative and pooling tasks,
you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode.
#### Embedding
See the link below for more information on the models supported for specific pooling tasks.
These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
- [Classification Usages](pooling_models/classify.md)
- [Embedding Usages](pooling_models/embed.md)
- [Reward Usages](pooling_models/reward.md)
- [Token Classification Usages](pooling_models/token_classify.md)
- [Token Embedding Usages](pooling_models/token_embed.md)
- [Scoring Usages](pooling_models/scoring.md)
- [Specific Model Examples](pooling_models/specific_models.md)
!!! note
To get the best results, you should use pooling models that are specifically trained as such.
The following table lists those that are tested in vLLM.
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | |
| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
---
#### Cross-encoder / Reranker
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + I<sup>E+</sup> | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
!!! note
Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`.
```bash
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
```
## Model Support Policy

View File

@@ -23,6 +23,7 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
| `flashinfer_nvlink_one_sided` | MNNVL systems | FlashInfer's one-sided A2A strategy for multi-node NVLink | High-throughput workloads |
| `flashinfer_nvlink_two_sided` | MNNVL systems | FlashInfer's two-sided A2A strategy for multi-node NVLink | Systems with NVLink across nodes |
| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
## Single Node Deployment

View File

@@ -16,7 +16,7 @@ After initializing the `LLM` instance, use the available APIs to perform model i
The available APIs depend on the model type:
- [Generative models](../models/generative_models.md) output logprobs which are sampled from to obtain the final output text.
- [Pooling models](../models/pooling_models/README.md) output their hidden states directly.
- [Pooling models](../models/pooling_models.md) output their hidden states directly.
!!! info
[API Reference](../api/README.md#offline-inference)

View File

@@ -53,8 +53,8 @@ We currently support the following OpenAI APIs:
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `user` parameter is ignored.*
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
- [Embeddings API](../models/pooling_models/embed.md#openai-compatible-embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models/embed.md).
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md).
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
- [Translation API](#translations-api) (`/v1/audio/translations`)
@@ -66,19 +66,20 @@ In addition, we have the following custom APIs:
- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
- Applicable to any model with a tokenizer.
- [pooling API](../models/pooling_models/README.md#pooling-api) (`/pooling`)
- Applicable to all [pooling models](../models/pooling_models/README.md).
- [Classification API](../models/pooling_models/classify.md#classification-api) (`/classify`)
- Only applicable to [classification models](../models/pooling_models/classify.md).
- [Cohere Embed API](../models/pooling_models/embed.md#cohere-embed-api) (`/v2/embed`)
- [Pooling API](#pooling-api) (`/pooling`)
- Applicable to all [pooling models](../models/pooling_models.md).
- [Classification API](#classification-api) (`/classify`)
- Only applicable to [classification models](../models/pooling_models.md).
- [Score API](#score-api) (`/score`)
- Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
- [Cohere Embed API](#cohere-embed-api) (`/v2/embed`)
- Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed)
- Works with any [embedding model](../models/pooling_models/embed.md#supported-models), including multimodal models.
- [Score API](../models/pooling_models/scoring.md#score-api) (`/score`)
- Applicable to [score models](../models/pooling_models/scoring.md).
- [Rerank API](../models/pooling_models/scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- Implements [Jina AI's v1 rerank API](https://jina.ai/reranker/)
- Also compatible with [Cohere's v1 & v2 rerank APIs](https://docs.cohere.com/v2/reference/rerank)
- Works with any [embedding model](../models/pooling_models.md), including multimodal models.
- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
- Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
- Only applicable to [cross-encoder models](../models/pooling_models.md).
## Chat Template
@@ -268,6 +269,300 @@ The following extra parameters in the response object are supported:
--8<-- "vllm/entrypoints/openai/responses/protocol.py:responses-response-extra-params"
```
### Embeddings API
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
??? code
```python
from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
def create_chat_embeddings(
client: OpenAI,
*,
messages: list[ChatCompletionMessageParam],
model: str,
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
) -> CreateEmbeddingResponse:
return client.post(
"/embeddings",
cast_to=CreateEmbeddingResponse,
body={"messages": messages, "model": model, "encoding_format": encoding_format},
)
```
#### Multi-modal inputs
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
and passing a list of `messages` in the request. Refer to the examples below for illustration.
=== "VLM2Vec"
To serve the model:
```bash
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
```
!!! important
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling`
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
??? code
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="EMPTY",
)
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
response = create_chat_embeddings(
client,
model="TIGER-Lab/VLM2Vec-Full",
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
encoding_format="float",
)
print("Image embedding output:", response.data[0].embedding)
```
=== "DSE-Qwen2-MRL"
To serve the model:
```bash
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
```
!!! important
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
!!! important
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.
Full example: [examples/pooling/embed/vision_embedding_online.py](../../examples/pooling/embed/vision_embedding_online.py)
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:embed-pooling-params"
```
The following Embeddings API parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
```
The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
```
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
The following parameters are supported by default:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-params"
```
these extra parameters are supported instead:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:encoding-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
```
### Cohere Embed API
Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed) which adds support for some modern embedding feature such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models).
#### Cohere Embed API request parameters
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `model` | string | Yes | Model name |
| `input_type` | string | No | Prompt prefix key (model-dependent, see below) |
| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) |
| `images` | list[string] | No | Base64 data URI images |
| `inputs` | list[object] | No | Mixed text and image content objects |
| `embedding_types` | list[string] | No | Output types (default: `["float"]`) |
| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) |
| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) |
#### Text embedding
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["Hello world", "How are you?"],
"embedding_types": ["float"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [
[0.012, -0.034, ...],
[0.056, 0.078, ...]
]
},
"texts": ["Hello world", "How are you?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 12}
}
}
```
#### Mixed text and image inputs
For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content:
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "google/siglip-so400m-patch14-384",
"inputs": [
{
"content": [
{"type": "text", "text": "A photo of a cat"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}
]
}
],
"embedding_types": ["float"]
}'
```
#### Embedding types
The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call:
| Type | Description |
| ---- | ----------- |
| `float` | Raw float32 embeddings (default) |
| `binary` | Bit-packed signed binary |
| `ubinary` | Bit-packed unsigned binary |
| `base64` | Little-endian float32 encoded as base64 |
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["What is machine learning?"],
"embedding_types": ["float", "binary"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [[0.012, -0.034, ...]],
"binary": [[42, -117, ...]]
},
"texts": ["What is machine learning?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 8}
}
}
```
#### Truncation
The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled:
| Value | Behavior |
| ----- | --------- |
| `END` (default) | Keep the first tokens, drop the end |
| `START` | Keep the last tokens, drop the beginning |
| `NONE` | Return an error if the input is too long |
#### Input type and prompt prefixes
The `input_type` field selects a prompt prefix to prepend to each text input. The available values
depend on the model:
- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are
the valid `input_type` values and the corresponding value is prepended to each text.
- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are
the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`,
so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`.
- **Other models**: `input_type` is not accepted and will raise a validation error if passed.
### Transcriptions API
Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
@@ -464,8 +759,172 @@ It consists of two endpoints:
- `/tokenize` corresponds to calling `tokenizer.encode()`.
- `/detokenize` corresponds to calling `tokenizer.decode()`.
### Pooling API
Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Code example: [examples/pooling/pooling/pooling_online.py](../../examples/pooling/pooling/pooling_online.py)
### Classification API
Our Classification API directly supports Hugging Face sequence-classification models such as [ai21labs/Jamba-tiny-reward-dev](https://huggingface.co/ai21labs/Jamba-tiny-reward-dev) and [jason9693/Qwen2.5-1.5B-apeach](https://huggingface.co/jason9693/Qwen2.5-1.5B-apeach).
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
Code example: [examples/pooling/classify/classification_online.py](../../examples/pooling/classify/classification_online.py)
#### Example Requests
You can classify multiple texts by passing an array of strings:
```bash
curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \
-d '{
"model": "jason9693/Qwen2.5-1.5B-apeach",
"input": [
"Loved the new café—coffee was great.",
"This update broke everything. Frustrating."
]
}'
```
??? console "Response"
```json
{
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
"object": "list",
"created": 1745383065,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
],
"num_classes": 2
},
{
"index": 1,
"label": "Spoiled",
"probs": [
0.26448777318000793,
0.7355121970176697
],
"num_classes": 2
}
],
"usage": {
"prompt_tokens": 20,
"total_tokens": 20,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
```
You can also pass a string directly to the `input` field:
```bash
curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \
-d '{
"model": "jason9693/Qwen2.5-1.5B-apeach",
"input": "Loved the new café—coffee was great."
}'
```
??? console "Response"
```json
{
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
"object": "list",
"created": 1745383213,
"model": "jason9693/Qwen2.5-1.5B-apeach",
"data": [
{
"index": 0,
"label": "Default",
"probs": [
0.565970778465271,
0.4340292513370514
],
"num_classes": 2
}
],
"usage": {
"prompt_tokens": 10,
"total_tokens": 10,
"completion_tokens": 0,
"prompt_tokens_details": null
}
}
```
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
The following Classification API parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
```
The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:completion-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
For chat-like input (i.e. if `messages` is passed), the following parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-params"
```
these extra parameters are supported instead:
??? code
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:chat-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
### Score API
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
Code example: [examples/pooling/score/score_api_online.py](../../examples/pooling/score/score_api_online.py)
#### Score Template
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
@@ -481,6 +940,307 @@ This approach is more robust than index-based access (`messages[0]`, `messages[1
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
#### Single inference
You can pass a string to both `queries` and `documents`, forming a single sentence pair.
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float",
"queries": "What is the capital of France?",
"documents": "The capital of France is Paris."
}'
```
??? console "Response"
```json
{
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 1
}
],
"usage": {}
}
```
#### Batch inference
You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs
where each pair is built from `queries` and a string in `documents`.
The total number of pairs is `len(documents)`.
??? console "Request"
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"queries": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
??? console "Response"
```json
{
"id": "score-request-id",
"object": "list",
"created": 693570,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 0.001094818115234375
},
{
"index": 1,
"object": "score",
"score": 1
}
],
"usage": {}
}
```
You can pass a list to both `queries` and `documents`, forming multiple sentence pairs
where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`).
The total number of pairs is `len(documents)`.
??? console "Request"
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/score' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float",
"queries": [
"What is the capital of Brazil?",
"What is the capital of France?"
],
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
}'
```
??? console "Response"
```json
{
"id": "score-request-id",
"object": "list",
"created": 693447,
"model": "BAAI/bge-reranker-v2-m3",
"data": [
{
"index": 0,
"object": "score",
"score": 1
},
{
"index": 1,
"object": "score",
"score": 1
}
],
"usage": {}
}
```
#### Multi-modal inputs
You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal input (image, etc.) in the request. Refer to the examples below for illustration.
=== "JinaVL-Reranker"
To serve the model:
```bash
vllm serve jinaai/jina-reranker-m0
```
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
??? Code
```python
import requests
response = requests.post(
"http://localhost:8000/v1/score",
json={
"model": "jinaai/jina-reranker-m0",
"queries": "slm markdown",
"documents": [
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
],
},
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
]
},
],
},
)
response.raise_for_status()
response_json = response.json()
print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"])
```
Full example:
- [examples/pooling/score/vision_score_api_online.py](../../examples/pooling/score/vision_score_api_online.py)
- [examples/pooling/score/vision_rerank_api_online.py](../../examples/pooling/score/vision_rerank_api_online.py)
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
The following Score API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
### Re-rank API
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1.
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the
`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank`
endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools.
Code example: [examples/pooling/score/rerank_api_online.py](../../examples/pooling/score/rerank_api_online.py)
#### Example Request
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
??? console "Request"
```bash
curl -X 'POST' \
'http://127.0.0.1:8000/v1/rerank' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "BAAI/bge-reranker-base",
"query": "What is the capital of France?",
"documents": [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Horses and cows are both animals"
]
}'
```
??? console "Response"
```json
{
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
"model": "BAAI/bge-reranker-base",
"usage": {
"total_tokens": 56
},
"results": [
{
"index": 1,
"document": {
"text": "The capital of France is Paris."
},
"relevance_score": 0.99853515625
},
{
"index": 0,
"document": {
"text": "The capital of Brazil is Brasilia."
},
"relevance_score": 0.0005860328674316406
}
]
}
```
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classify-pooling-params"
```
The following Re-rank API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
```
## Ray Serve LLM
Ray Serve LLM enables scalable, production-grade serving of the vLLM engine. It integrates tightly with vLLM and extends it with features such as auto-scaling, load balancing, and back-pressure.

View File

@@ -1,63 +0,0 @@
# Async Reinforcement Learning
## Overview
In a standard RL training loop, generation and training happen sequentially: the policy generates rollouts, then training runs on those rollouts, and the cycle repeats. During generation the training accelerators sit idle, and vice versa.
The **one-off pipelining** approach separates the generation and training phases into two parallel coroutines, allowing the model to generate new samples while simultaneously training on previously generated data. This can lead to better GPU utilization and greater training throughput.
However, this overlap introduces a complication: weights must be updated in the inference engine mid-flight, while requests may still be in progress.
## The Pause and Resume API
To safely update weights while the inference engine is running, vLLM provides `pause_generation` and `resume_generation` methods. These let the trainer coordinate a clean window for weight synchronization without losing in-flight work.
### pause_generation
```python
await engine.pause_generation(mode="keep", clear_cache=True)
```
The `mode` parameter controls how in-flight requests are handled:
| Mode | Behavior |
| ---- | -------- |
| `"abort"` | Abort all in-flight requests immediately and return partial results (default) |
| `"wait"` | Wait for all in-flight requests to finish before pausing |
| `"keep"` | Freeze requests in the queue; they resume when `resume_generation` is called |
The `clear_cache` parameter controls whether to clear the KV cache and prefix cache after pausing.
### resume_generation
```python
await engine.resume_generation()
```
Resumes the scheduler after a pause. Any requests frozen with `mode="keep"` will continue generating.
### HTTP Endpoints
When using the vLLM HTTP server, the same functionality is available via:
- `POST /pause?mode=keep` - Pause generation
- `POST /resume` - Resume generation
!!! note "Data Parallelism"
When using data parallelism with vLLM's **internal load balancer** (i.e. `data_parallel_backend="ray"`), pause and resume are handled automatically across all DP ranks -- a single call is sufficient. When using an **external load balancer** (i.e. multiple independent vLLM instances behind a proxy), you must send pause and resume requests to **every** engine instance individually before and after the weight update.
## Typical Async RL Flow
A typical async RL loop with weight syncing looks like this:
1. Start generating rollouts from the current policy
2. Once trainer has new weights to update to, pause generation with `mode="keep"`
3. Sync the updated weights from the trainer to the inference engine (see [Weight Transfer](weight_transfer/README.md))
4. Resume generation -- in-flight requests continue with the new weights
5. Repeat
The key insight is that requests paused with `mode="keep"` will produce tokens from the **old** weights before the pause and tokens from the **new** weights after resume. The `clear_cache` parameter controls whether the KV cache is invalidated during the pause. When `clear_cache=True`, previously cached key-value entries are discarded, so all tokens generated after resume will be computed entirely with the new weights. When `clear_cache=False`, existing KV cache entries are retained, meaning some tokens in context may still reflect the old weights (stale KV cache).
## Example
The [async RLHF example](../examples/rl/rlhf_async_new_apis.md) demonstrates this pattern with `vllm.AsyncLLMEngine`, NCCL weight transfer, and mid-flight pause/resume with validation.

View File

@@ -16,9 +16,11 @@ The following open-source RL libraries use vLLM for fast rollouts (sorted alphab
- [Unsloth](https://github.com/unslothai/unsloth)
- [verl](https://github.com/volcengine/verl)
For weight synchronization between training and inference, see the [Weight Transfer](weight_transfer/README.md) documentation, which covers the pluggable backend system with [NCCL](weight_transfer/nccl.md) (multi-GPU) and [IPC](weight_transfer/ipc.md) (same-GPU) engines.
See the following basic examples to get started if you don't want to use an existing library:
For pipelining generation and training to improve GPU utilization and throughput, see the [Async Reinforcement Learning](async_rl.md) guide, which covers the pause/resume API for safely updating weights mid-flight.
- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
See the following notebooks showing how to use vLLM for GRPO:

View File

@@ -1,78 +0,0 @@
# Weight Transfer
vLLM provides a pluggable weight transfer system for synchronizing model weights from a training process to the inference engine during reinforcement learning (RL) workflows. This is essential for RLHF, GRPO, and other online RL methods where the policy model is iteratively updated during training and the updated weights must be reflected in the inference engine for rollout generation.
## Architecture
The weight transfer system follows a **two-phase protocol** with a pluggable backend design:
1. **Initialization** (`init_weight_transfer_engine`): Establishes the communication channel between the trainer and inference workers. Called once before the training loop begins.
2. **Weight Update** (`update_weights`): Transfers updated weights from the trainer to the inference engine. Called after each training step (or batch of steps).
## Available Backends
| Backend | Transport | Use Case |
| ------- | --------- | -------- |
| [NCCL](nccl.md) | NCCL broadcast | Separate GPUs for training and inference |
| [IPC](ipc.md) | CUDA IPC handles | Colocated training and inference on same GPU |
## Configuration
Specify the weight transfer backend through `WeightTransferConfig`. The backend determines which engine handles the weight synchronization.
### Programmatic (Offline Inference)
```python
from vllm import LLM
from vllm.config import WeightTransferConfig
llm = LLM(
model="my-model",
weight_transfer_config=WeightTransferConfig(backend="nccl"), # or "ipc"
)
```
### CLI (Online Serving)
```bash
vllm serve my-model \
--weight-transfer-config '{"backend": "nccl"}'
```
The `backend` field accepts `"nccl"` (default) or `"ipc"`.
## API Endpoints
When running vLLM as an HTTP server, the following endpoints are available for weight transfer:
| Endpoint | Method | Description |
| -------- | ------ | ----------- |
| `/init_weight_transfer_engine` | POST | Initialize the weight transfer engine with backend-specific info |
| `/update_weights` | POST | Trigger a weight update with backend-specific metadata |
| `/pause` | POST | Pause generation before weight sync to handle inflight requests |
| `/resume` | POST | Resume generation after weight sync |
| `/get_world_size` | GET | Get the number of inference workers (useful for NCCL world size calculation) |
!!! note
The HTTP weight transfer endpoints require `VLLM_SERVER_DEV_MODE=1` to be set.
## Trainer-Side API
Both backends provide static methods that the trainer calls to send weights. The general pattern is:
```python
# 1. Initialize the transfer engine (backend-specific)
EngineClass.trainer_init(init_info)
# 2. Send weights to inference workers
EngineClass.trainer_send_weights(
iterator=model.named_parameters(),
trainer_args=backend_specific_args,
)
```
See the [NCCL](nccl.md) and [IPC](ipc.md) pages for backend-specific trainer APIs and full examples.
## Extending the System
The weight transfer system is designed to be extensible. You can implement custom backends by subclassing `WeightTransferEngine` and registering them with the factory. See the [Base Class](base.md) page for details.

View File

@@ -1,162 +0,0 @@
# Base Class and Custom Engines
The weight transfer system is built on an abstract base class that defines the contract between vLLM's worker infrastructure and the transport backend. You can implement custom backends by subclassing `WeightTransferEngine` and registering them with the `WeightTransferEngineFactory`.
## WeightTransferEngine
The `WeightTransferEngine` is a generic abstract class parameterized by two dataclass types:
- **`TInitInfo`** (extends `WeightTransferInitInfo`): Backend-specific initialization parameters.
- **`TUpdateInfo`** (extends `WeightTransferUpdateInfo`): Backend-specific weight update metadata.
### Abstract Methods
Subclasses must implement these four methods:
| Method | Side | Description |
| ------ | ---- | ----------- |
| `init_transfer_engine(init_info)` | Inference | Initialize the communication channel on each inference worker |
| `receive_weights(update_info, load_weights)` | Inference | Receive weights and call `load_weights` incrementally |
| `shutdown()` | Inference | Clean up resources |
| `trainer_send_weights(iterator, trainer_args)` | Trainer | Static method to send weights from the trainer process |
### Request Classes
The API-level request classes provide backend-agnostic serialization using plain dictionaries. The engine's `parse_init_info` and `parse_update_info` methods convert these dictionaries into typed dataclasses.
```python
from vllm.distributed.weight_transfer.base import (
WeightTransferInitRequest,
WeightTransferUpdateRequest,
)
# Init request (dict is converted to backend-specific TInitInfo)
init_request = WeightTransferInitRequest(
init_info={"master_address": "10.0.0.1", "master_port": 29500, ...}
)
# Update request (dict is converted to backend-specific TUpdateInfo)
update_request = WeightTransferUpdateRequest(
update_info={"names": [...], "dtype_names": [...], "shapes": [...]}
)
```
### WeightTransferUpdateInfo
The base `WeightTransferUpdateInfo` includes an `is_checkpoint_format` flag:
```python
@dataclass
class WeightTransferUpdateInfo(ABC):
is_checkpoint_format: bool = True
```
When `is_checkpoint_format=True` (the default), vLLM applies layerwise weight processing (repacking, renaming, etc.) on the received weights before loading them. Set to `False` if the trainer has already converted weights to the kernel format expected by the model.
## Implementing a Custom Engine
To create a custom weight transfer backend:
### 1. Define Info Dataclasses
```python
from dataclasses import dataclass
from vllm.distributed.weight_transfer.base import (
WeightTransferEngine,
WeightTransferInitInfo,
WeightTransferUpdateInfo,
)
@dataclass
class MyInitInfo(WeightTransferInitInfo):
endpoint: str
token: str
@dataclass
class MyUpdateInfo(WeightTransferUpdateInfo):
names: list[str]
dtype_names: list[str]
shapes: list[list[int]]
# Add custom fields as needed
```
### 2. Implement the Engine
```python
from collections.abc import Callable, Iterator
from typing import Any
import torch
class MyWeightTransferEngine(WeightTransferEngine[MyInitInfo, MyUpdateInfo]):
init_info_cls = MyInitInfo
update_info_cls = MyUpdateInfo
def init_transfer_engine(self, init_info: MyInitInfo) -> None:
# Set up connection to trainer using init_info.endpoint, etc.
...
def receive_weights(
self,
update_info: MyUpdateInfo,
load_weights: Callable[[list[tuple[str, torch.Tensor]]], None],
) -> None:
# Receive each weight and call load_weights incrementally
for name, dtype_name, shape in zip(
update_info.names, update_info.dtype_names, update_info.shapes
):
dtype = getattr(torch, dtype_name)
weight = self._fetch_weight(name, shape, dtype)
load_weights([(name, weight)])
def shutdown(self) -> None:
# Clean up resources
...
@staticmethod
def trainer_send_weights(
iterator: Iterator[tuple[str, torch.Tensor]],
trainer_args: dict[str, Any],
) -> None:
# Send weights from the trainer process
for name, tensor in iterator:
# Send tensor via custom transport
...
```
!!! important
The `load_weights` callable passed to `receive_weights` should be called **incrementally** (one or a few weights at a time) rather than accumulating all weights first. This avoids GPU out-of-memory errors with large models.
### 3. Register with the Factory
```python
from vllm.distributed.weight_transfer.factory import WeightTransferEngineFactory
# Option 1: Lazy loading (recommended for built-in engines)
WeightTransferEngineFactory.register_engine(
"my_backend",
"my_package.my_module",
"MyWeightTransferEngine",
)
# Option 2: Direct class registration
WeightTransferEngineFactory.register_engine(
"my_backend",
MyWeightTransferEngine,
)
```
Once registered, users can select your backend via `WeightTransferConfig(backend="my_backend")`.
## WeightTransferEngineFactory
The factory uses a registry pattern with lazy loading. Built-in engines (`nccl` and `ipc`) are registered at import time but their modules are only loaded when the backend is actually requested. This avoids importing heavy dependencies (like NCCL communicators) when they aren't needed.
```python
from vllm.distributed.weight_transfer.factory import WeightTransferEngineFactory
# Create an engine from config
engine = WeightTransferEngineFactory.create_engine(
config=weight_transfer_config,
parallel_config=parallel_config,
)
```

View File

@@ -1,73 +0,0 @@
# IPC Engine
The IPC weight transfer engine uses **CUDA IPC** (Inter-Process Communication) handles to share GPU memory directly between the trainer and inference workers on the **same node and same GPU**. This avoids any data copying, making it a efficient option when colocating training and inference.
## When to Use IPC
- Training and inference on the **same GPU** (colocated)
- You want to minimize memory overhead by sharing tensors in-place
## How It Works
1. The trainer creates CUDA tensors for each weight and generates IPC handles using `torch.multiprocessing.reductions.reduce_tensor`.
2. IPC handles are sent to the inference engine via **Ray.remote()** or **HTTP POST**.
3. The inference worker reconstructs the tensors from the handles, reading directly from the trainer's GPU memory.
!!! warning
IPC handles involve sending serialized Python objects. When using HTTP transport, you must set `VLLM_ALLOW_INSECURE_SERIALIZATION=1` on both the server and client. This is because IPC handles are pickled and base64-encoded for HTTP transmission.
## Initialization
The IPC backend requires no initialization on either side. The `init_transfer_engine` call is a no-op for IPC.
## Sending Weights
IPC supports two transport modes for delivering the handles:
### Ray Mode
Used when vLLM is running as a Ray actor:
```python
from vllm.distributed.weight_transfer.ipc_engine import (
IPCTrainerSendWeightsArgs,
IPCWeightTransferEngine,
)
trainer_args = IPCTrainerSendWeightsArgs(
mode="ray",
llm_handle=llm_actor_handle,
)
IPCWeightTransferEngine.trainer_send_weights(
iterator=model.named_parameters(),
trainer_args=trainer_args,
)
```
In Ray mode, the engine calls `llm_handle.update_weights.remote(...)` directly, passing the IPC handles via Ray's serialization.
### HTTP Mode
Used when vLLM is running as an HTTP server:
```python
trainer_args = IPCTrainerSendWeightsArgs(
mode="http",
url="http://localhost:8000",
)
IPCWeightTransferEngine.trainer_send_weights(
iterator=model.named_parameters(),
trainer_args=trainer_args,
)
```
In HTTP mode, IPC handles are pickled, base64-encoded, and sent as JSON to the `/update_weights` endpoint.
See [`IPCTrainerSendWeightsArgs`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/weight_transfer/ipc_engine.py) for the full list of configurable fields.
## Examples
- [RLHF with IPC weight syncing (offline, Ray)](../../examples/rl/rlhf_ipc.md) - Colocated training and inference on a single GPU using Ray placement groups and CUDA IPC handles
- [RLHF with IPC weight syncing (online serving, HTTP)](../../examples/rl/rlhf_http_ipc.md) - Weight transfer with a vLLM HTTP server where both server and trainer share the same GPU

View File

@@ -1,110 +0,0 @@
# NCCL Engine
The NCCL weight transfer engine uses [NCCL](https://developer.nvidia.com/nccl) broadcast operations to transfer weights from the trainer to inference workers. It supports **multi-node** and **multi-GPU** setups where the trainer and inference engine run on separate GPUs.
## When to Use NCCL
- Training and inference on **separate GPUs** (possibly across nodes)
- **Tensor-parallel** inference with multiple workers that all need the updated weights
- You need high-bandwidth, low-latency weight transfer over NVLink or InfiniBand
## How It Works
1. The trainer and all inference workers join a shared NCCL process group using `StatelessProcessGroup` (vLLM's torch.distributed-independent group abstraction).
2. The trainer broadcasts weights to all workers simultaneously. Each worker receives and loads weights incrementally.
3. Optionally, **packed tensor broadcasting** batches multiple small tensors into larger buffers with double/triple buffering and CUDA stream overlap for higher throughput. This implementation is based on [NeMo-RL's packed tensor](https://github.com/NVIDIA-NeMo/RL/blob/main/nemo_rl/utils/packed_tensor.py).
## Initialization
NCCL requires explicit process group setup. The trainer and inference workers must agree on a master address, port, and world size.
### Inference Side
```python
from vllm.distributed.weight_transfer.base import WeightTransferInitRequest
# rank_offset accounts for the trainer occupying rank 0
llm.init_weight_transfer_engine(
WeightTransferInitRequest(
init_info=dict(
master_address=master_address,
master_port=master_port,
rank_offset=1,
world_size=world_size, # trainer + all inference workers
)
)
)
```
### Trainer Side
```python
from vllm.distributed.weight_transfer.nccl_engine import (
NCCLWeightTransferEngine,
)
group = NCCLWeightTransferEngine.trainer_init(
dict(
master_address=master_address,
master_port=master_port,
world_size=world_size,
)
)
```
!!! note
`trainer_init` always assigns the trainer to rank 0. Inference workers start at `rank_offset` (typically 1).
## Sending Weights
```python
from vllm.distributed.weight_transfer.nccl_engine import (
NCCLTrainerSendWeightsArgs,
NCCLWeightTransferEngine,
)
trainer_args = NCCLTrainerSendWeightsArgs(
group=group,
packed=True, # use packed broadcasting for efficiency
)
NCCLWeightTransferEngine.trainer_send_weights(
iterator=model.named_parameters(),
trainer_args=trainer_args,
)
```
See [`NCCLTrainerSendWeightsArgs`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/weight_transfer/nccl_engine.py) for the full list of configurable fields.
### Packed Tensor Broadcasting
When `packed=True`, multiple weight tensors are packed into large contiguous buffers before broadcasting. This reduces the number of NCCL operations and uses double/triple buffering with dedicated CUDA streams for overlap between packing, broadcasting, and unpacking.
Both the trainer (`NCCLTrainerSendWeightsArgs`) and inference side (`NCCLWeightTransferUpdateInfo`) must use matching `packed_buffer_size_bytes` and `packed_num_buffers` values.
## Receiving Weights (Inference Side)
The inference side triggers weight reception by calling `update_weights`:
```python
from vllm.distributed.weight_transfer.base import WeightTransferUpdateRequest
llm.update_weights(
WeightTransferUpdateRequest(
update_info=dict(
names=names,
dtype_names=dtype_names,
shapes=shapes,
packed=True,
)
)
)
```
The `names`, `dtype_names`, and `shapes` lists describe each parameter. These must match the order in which the trainer iterates over its parameters.
## Examples
- [RLHF with NCCL weight syncing (offline, Ray)](../../examples/rl/rlhf_nccl.md) - Trainer on one GPU, 2x tensor-parallel vLLM engine on two others, with packed NCCL weight broadcast
- [RLHF with async weight syncing (offline, Ray)](../../examples/rl/rlhf_async_new_apis.md) - Async generation with mid-flight pause, weight sync, resume, and validation against a fresh model
- [RLHF with NCCL weight syncing (online serving, HTTP)](../../examples/rl/rlhf_http_nccl.md) - Weight transfer with a running vLLM HTTP server using HTTP control plane and NCCL data plane

View File

@@ -70,29 +70,6 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
)
# CohereASR
def run_cohere_asr(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, "CohereASR only support single audio input per prompt"
# TODO (ekagra): add HF ckpt after asr release
model_name = "/host/engines/vllm/audio/2b-release"
prompt = (
"<|startofcontext|><|startoftranscript|>"
"<|emo:undefined|><|en|><|en|><|pnc|><|noitn|>"
"<|notimestamp|><|nodiarize|>"
)
engine_args = EngineArgs(
model=model_name,
limit_mm_per_prompt={"audio": audio_count},
trust_remote_code=True,
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# MusicFlamingo
def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
model_name = "nvidia/music-flamingo-2601-hf"
@@ -531,15 +508,14 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map = {
"audioflamingo3": run_audioflamingo3,
"cohere_asr": run_cohere_asr,
"funaudiochat": run_funaudiochat,
"musicflamingo": run_musicflamingo,
"gemma3n": run_gemma3n,
"glmasr": run_glmasr,
"funaudiochat": run_funaudiochat,
"granite_speech": run_granite_speech,
"kimi_audio": run_kimi_audio,
"midashenglm": run_midashenglm,
"minicpmo": run_minicpmo,
"musicflamingo": run_musicflamingo,
"phi4_mm": run_phi4mm,
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,

View File

@@ -2,38 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates async reinforcement learning using vLLM and Ray,
with native weight syncing APIs and batch-invariant generation.
with native weight syncing APIs at engine instance.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies one GPU for training, and a
vLLM AsyncLLMEngine occupies another GPU for inference.
Batch invariance is enabled so that generation output is deterministic
regardless of how many requests are batched together. This is required
for the validation phase to succeed. Batch invariance currently requires
NVIDIA GPUs with compute capability 9.0 or higher:
- H-series: H100, H200
- B-series: B100, B200
A Hugging Face Transformer model occupies one GPU for training, whereas a
2x tensor-parallel vLLM inference engine occupies two GPUs.
The example performs the following steps:
* Load the training model (Qwen3-1.7B) on one GPU via a Ray actor.
* Initialize the inference engine with a base model (Qwen3-1.7B-Base)
on a separate GPU using vLLM's AsyncLLMEngine with Ray as the
distributed executor backend.
* Set up an NCCL-based weight transfer channel between the trainer
and the inference engine.
* Submit generation requests for a batch of prompts.
* Pause generation once any request reaches a token threshold.
* Broadcast the training model's weights to the inference engine
via the NCCL weight transfer engine, replacing the base weights.
* Resume generation and collect results, noting which tokens were
generated before vs. after the weight swap.
* Validate correctness by launching a fresh vLLM instance loaded
directly with the training model and comparing its output to the
post-swap tokens from the weight-synced engine.
* Load the training model on one gpu (scheduled via ray)
* Initialize the inference model with dummy weights across
two gpus using vLLM's tensor parallelism and Ray placement groups.
* Generate gibberish from a list of prompts using the randomly initialized
inference engine.
* Pause generation once generation completes for one sequence
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group.
* Resume generation and print out the results
This example assumes a single-node cluster with two GPUs, but Ray
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
workloads. Residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.

View File

@@ -0,0 +1,147 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies GPU 0 for training, whereas a
tensor-parallel vLLM inference engine occupies GPU 12.
The example performs the following steps:
* Load the training model on GPU 0.
* Split the inference model across GPUs 12 using vLLM's tensor parallelism
and Ray placement groups.
* Generate text from a list of prompts using the inference engine.
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group. Note that
for demonstration purposes we simply zero out the weights.
For a production-ready implementation that supports multiple training and
inference replicas, see the OpenRLHF framework:
https://github.com/OpenRLHF/OpenRLHF
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
workloads. Residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
"""
import os
import ray
import torch
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from rlhf_utils import stateless_init_process_group
from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.utils.network_utils import get_ip, get_open_port
class MyLLM(LLM):
"""Configure the vLLM worker for Ray placement group execution."""
def __init__(self, *args, **kwargs):
# Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
# so that vLLM can manage its own device placement within the worker.
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
super().__init__(*args, **kwargs)
# Load the OPT-125M model onto GPU 0 for the training workload.
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
train_model.to("cuda:0")
# Initialize Ray and set the visible devices. The vLLM engine will
# be placed on GPUs 1 and 2.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
ray.init()
# Create a placement group that reserves GPU 12 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group=pg_inference,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=0,
)
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
# start-up latency.
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=scheduling_inference,
)(MyLLM).remote(
model="facebook/opt-125m",
enforce_eager=True,
worker_extension_cls="rlhf_utils.WorkerExtension",
tensor_parallel_size=2,
distributed_executor_backend="ray",
)
# Generate text from the prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Set up the communication channel between the training process and the
# inference engine.
master_address = get_ip()
master_port = get_open_port()
handle = llm.collective_rpc.remote(
"init_weight_update_group", args=(master_address, master_port, 1, 3)
)
model_update_group = stateless_init_process_group(
master_address, master_port, 0, 3, torch.device("cuda:0")
)
ray.get(handle)
# Simulate a training step by zeroing out all model weights.
# In a real RLHF training loop the weights would be updated using the gradient
# from an RL objective such as PPO on a reward model.
for name, p in train_model.named_parameters():
p.data.zero_()
# Synchronize the updated weights to the inference engine.
for name, p in train_model.named_parameters():
dtype_name = str(p.dtype).split(".")[-1]
handle = llm.collective_rpc.remote(
"update_weight", args=(name, dtype_name, p.shape)
)
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
ray.get(handle)
# Verify that the inference weights have been updated.
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
# Generate text with the updated model. The output is expected to be nonsense
# because the weights are zero.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)

View File

@@ -0,0 +1,256 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to co-locate a vLLM inference worker and training
actors on the same set of GPUs for reinforcement learning from human feedback
(RLHF) workloads.
Ray serves as the distributed execution framework in this example. Ray
placement groups allocate both training actors and vLLM workers to the
same GPU bundles, enabling fast, in-GPU communication between the two
components.
The script shows how to do the following:
* Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
`VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
devices.
* Exchange tensors between processes by means of CUDA inter-process
communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
when multiple processes share a single GPU.
Note that this example assumes a single-node cluster with four GPUs, but Ray
supports multi-node clusters. vLLM expects exclusive use of the GPUs during
its initialization for memory profiling. Residual GPU activity interferes
with vLLM memory profiling and causes unexpected behavior.
Learn more about Ray placement groups:
https://docs.ray.io/en/latest/placement-groups.html
"""
import gc
import os
import sys
import ray
import torch
import zmq
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from torch.multiprocessing.reductions import reduce_tensor
from vllm import LLM
if torch.version.hip is not None:
print("Skipping test for ROCm. Ray is unsupported on vLLM ROCm.")
sys.exit(0)
class MyLLM(LLM):
"""Configure the vLLM worker for Ray placement group execution.
The constructor sets environment variables that allow multiple vLLM
workers to share a single physical GPU and that encode the bundle
indices assigned by the placement group.
Args:
*args: Positional arguments forwarded to `vllm.LLM`.
bundle_indices (list[int]): Placement-group bundle indices
assigned to this worker.
**kwargs: Keyword arguments forwarded to `vllm.LLM`.
"""
def __init__(self, *args, bundle_indices: list[int], **kwargs):
# Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES variable
# so that vLLM can its own device placement inside the worker.
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
# Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
print(f"creating LLM with bundle_indices={bundle_indices}")
super().__init__(*args, **kwargs)
class RayTrainingActor:
"""Training actor that hosts a Facebook OPT-125M model from Hugging Face.
The model is loaded onto the first GPU assigned to this actor, and expose
the CUDA IPC handles so that colocated vLLM workers can map tensors
directly.
"""
def __init__(self):
# Ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this actor.
from transformers import AutoModelForCausalLM
self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
self.model.to("cuda:0")
# Zero out all the parameters.
for name, p in self.model.named_parameters():
p.data.zero_()
torch.accelerator.synchronize()
# The argument for `get_device_uuid` is the index of the GPU in the
# list of visible devices.
from vllm.platforms import current_platform
self.device_uuid = current_platform.get_device_uuid(0)
self.zmq_context = zmq.Context()
self.zmq_address_counter = 0
self.zmq_handle = None
def report_device_id(self) -> str:
return self.device_uuid
def get_zmq_handles(self) -> dict[str, str]:
suffix = f"{self.device_uuid}-{self.zmq_address_counter}"
self.zmq_handle = f"ipc:///tmp/rl-colocate-zmq-{suffix}.sock"
self.zmq_address_counter += 1
return {self.device_uuid: self.zmq_handle}
def update_weights(self):
# align size to avoid misaligned address
align_size = 256
def get_size(p: torch.Tensor) -> int:
return (p.nbytes + align_size - 1) // align_size * align_size
named_parameters: dict[str, torch.nn.Parameter] = dict(
self.model.named_parameters()
)
max_tensor_size = max(get_size(p) for p in named_parameters.values())
# use max_tensor_size * 2 as buffer size
buffer = torch.empty(max_tensor_size * 2, dtype=torch.uint8, device="cuda:0")
s = self.zmq_context.socket(zmq.REQ)
s.bind(self.zmq_handle)
handle = reduce_tensor(buffer)
offset = 0
buckets: list[tuple[list[dict], list[torch.Tensor]]] = []
named_tensors: list[dict] = []
real_tensors: list[torch.Tensor] = []
for name, p in named_parameters.items():
size = get_size(p)
if offset + size > buffer.numel():
buckets.append((named_tensors, real_tensors))
named_tensors, real_tensors = [], []
offset = 0
# assume tensors are contiguous
named_tensors.append(
{"name": name, "dtype": p.dtype, "shape": p.shape, "offset": offset}
)
real_tensors.append(p)
offset += size
if named_tensors:
buckets.append((named_tensors, real_tensors))
s.send_pyobj(handle)
s.recv()
for named_tensors, real_tensors in buckets:
offset = 0
for p in real_tensors:
buffer[offset : offset + p.nbytes].data.copy_(
p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
)
offset += get_size(p)
torch.accelerator.synchronize()
s.send_pyobj(named_tensors)
s.recv()
s.send_pyobj(None)
s.recv()
s.close()
del buffer
gc.collect()
torch.accelerator.empty_cache()
# Ray manages four GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()
# Co-locate vLLM instances and training actors on the same set of GPUs:
# * GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
# (tensor parallelism = 2).
# * GPU 2 and 3: training actor 2, training actor 3, and vLLM instance 1
# (tensor parallelism = 2).
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")
training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []
for bundle_index in [0, 1, 2, 3]:
training_actor = ray.remote(
num_cpus=0,
num_gpus=0.4,
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=bundle_index,
),
)(RayTrainingActor).remote()
training_actors.append(training_actor)
for bundle_index, training_actor in enumerate(training_actors):
device_id = ray.get(training_actor.report_device_id.remote())
print(f"training actor {bundle_index} is on {device_id}")
training_actor_device_ids.append(device_id)
for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
# Use the following syntax instead of the @ray.remote decorator so that
# the placement group is customized for each bundle.
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg,
placement_group_capture_child_tasks=True,
),
)(MyLLM).remote(
model="facebook/opt-125m",
enforce_eager=True,
worker_extension_cls="rlhf_utils.ColocateWorkerExtension",
tensor_parallel_size=2,
distributed_executor_backend="ray",
gpu_memory_utilization=0.4,
bundle_indices=bundle_indices,
)
inference_engines.append(llm)
# Do not call any method on the inference engine at this point; the call
# blocks until the vLLM instance finishes initialization.
for i, llm in enumerate(inference_engines):
inference_engine_device_ids.append(
ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))
)
print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
# Verify placement: the first two training actors share the same GPUs as
# the first inference engine.
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# Verify placement: the last two training actors share the same GPUs as
# the second inference engine.
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
print("Gather all the ZMQ handles from the training actors.")
zmq_handles = {}
for actor in training_actors:
zmq_handles.update(ray.get(actor.get_zmq_handles.remote()))
print(f"ZMQ handles: {zmq_handles}")
print("Update the weights of the inference engines.")
ray.get(
[actor.update_weights.remote() for actor in training_actors]
+ [
llm.collective_rpc.remote("update_weights_from_ipc", args=(zmq_handles,))
for llm in inference_engines
]
)
print("Check if the weights are updated.")
for llm in inference_engines:
assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))

View File

@@ -0,0 +1,162 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies GPU 0 for training, whereas a
tensor-parallel vLLM inference engine occupies GPU 12.
The example performs the following steps:
* Load the training model on GPU 0.
* Split the inference model across GPUs 12 using vLLM's tensor parallelism
and Ray placement groups.
* Generate text from a list of prompts using the inference engine.
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group. Note that
for demonstration purposes we simply zero out the weights.
For a production-ready implementation that supports multiple training and
inference replicas, see the OpenRLHF framework:
https://github.com/OpenRLHF/OpenRLHF
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects the GPUs are only used for vLLM
workloads. Residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
"""
import json
import os
import ray
import torch
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from rlhf_utils import stateless_init_process_group
from torchao.core.config import config_to_dict
from torchao.quantization import (
Float8DynamicActivationFloat8WeightConfig,
PerRow,
)
from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.utils.network_utils import get_ip, get_open_port
class MyLLM(LLM):
"""Configure the vLLM worker for Ray placement group execution."""
def __init__(self, *args, **kwargs):
# Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
# so that vLLM can manage its own device placement within the worker.
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
super().__init__(*args, **kwargs)
# Load the OPT-125M model onto GPU 0 for the training workload.
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
train_model.to("cuda:0")
# Initialize Ray and set the visible devices. The vLLM engine will
# be placed on GPUs 1 and 2.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
ray.init()
# Create a placement group that reserves GPU 12 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group=pg_inference,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=0,
)
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
# start-up latency.
# generate torchao quantization config for RL rollout
# see https://github.com/vllm-project/vllm/pull/23014 for instructions to
# use serialized config files instead of passing around json string
config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
json_str = json.dumps(config_to_dict(config))
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=scheduling_inference,
)(MyLLM).remote(
model="facebook/opt-125m",
hf_overrides={"quantization_config_dict_json": json_str},
enforce_eager=True,
worker_extension_cls="rlhf_utils.WorkerExtension",
tensor_parallel_size=2,
distributed_executor_backend="ray",
)
# Generate text from the prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Set up the communication channel between the training process and the
# inference engine.
master_address = get_ip()
master_port = get_open_port()
handle = llm.collective_rpc.remote(
"init_weight_update_group", args=(master_address, master_port, 1, 3)
)
model_update_group = stateless_init_process_group(
master_address, master_port, 0, 3, torch.device("cuda:0")
)
ray.get(handle)
# Simulate a training step by zeroing out all model weights.
# In a real RLHF training loop the weights would be updated using the gradient
# from an RL objective such as PPO on a reward model.
for name, p in train_model.named_parameters():
p.data.zero_()
# Synchronize the updated weights to the inference engine.
for name, p in train_model.named_parameters():
dtype_name = str(p.dtype).split(".")[-1]
handle = llm.collective_rpc.remote(
"update_weight", args=(name, dtype_name, p.shape)
)
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
ray.get(handle)
# Verify that the inference weights have been updated.
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
# Generate text with the updated model. The output is expected to be nonsense
# because the weights are zero.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)

Some files were not shown because too many files have changed in this diff Show More