Compare commits

...

10 Commits

Author SHA1 Message Date
Elvir Crnčević
89138b21cc [Bugfix] Zero-init MLA attention output buffers to prevent NaN from CUDA graph padding (#37442)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit ef2c4f778d)
2026-03-18 18:44:16 -07:00
JartX
6edd43de3c [Bugfix][ROCm] Fix worker startup OOM on ROCm by skipping unreliable cudagraph memory profiling (#36720)
Signed-off-by: JartX <sagformas@epdcenter.es>
(cherry picked from commit e8f9dbc369)
2026-03-18 18:43:52 -07:00
Andreas Karatzas
16c971dbc7 [CI] Fix PaddleOCR-VL HF test failure due to create_causal_mask API rename (#37328)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit eaf7c9b976)
2026-03-18 11:04:33 -07:00
khluu
262ddd0d81 [cherry-pick][Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy #37322
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-18 01:48:32 -07:00
Li, Jiang
e60c1674b3 [Bugfix] Avoid OpenMP thread reallocation in CPU torch compile (#37391)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
(cherry picked from commit 261801242f)
2026-03-18 01:41:42 -07:00
Roy Wang
faa80947f5 [Performance] Add --enable-ep-weight-filter CLI option (#37351)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7a0)
2026-03-18 01:41:25 -07:00
Terry Gao
eeabf740bb [Custom Ops] Add functional + out variant for scaled_fp4_quant (#34389)
Signed-off-by: tianrengao <terrygao87@gmail.com>
(cherry picked from commit 3e6a1e1686)
2026-03-18 01:41:09 -07:00
Elvir Crnčević
cdcffafef8 Fix eplb nvfp4 experts hook (#37217)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Signed-off-by: Elvir Crncevic <elvir@anthropic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
(cherry picked from commit fd4d96302a)
2026-03-18 01:40:57 -07:00
Walter Beller-Morales
4d22667c32 [Feature][Frontend] add support for Cohere Embed v2 API (#37074)
Signed-off-by: walterbm <walter.beller.morales@gmail.com>
(cherry picked from commit 061980c36a)
2026-03-16 22:05:47 -07:00
Andreas Karatzas
1fe3932c8b [ROCm] Fix AttributeError for torch.compiler.skip_all_guards_unsafe on older PyTorch (#37219)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 54a62a79f7)
2026-03-16 21:03:49 -07:00
49 changed files with 2005 additions and 114 deletions

View File

@@ -295,10 +295,14 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_scale,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_scale);
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,

View File

@@ -16,6 +16,8 @@
#include <torch/all.h>
#include "nvfp4_utils.cuh"
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@@ -51,9 +53,10 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
torch::Tensor const& output_scale_offset_by_experts);
#endif
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_sf, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_sf,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_sf) {
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
@@ -62,6 +65,34 @@ void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
}
// Functional variant of NVFP4 quantization.
//
// Allocates the packed-fp4 output tensor and the scale-factor tensor,
// then delegates the actual quantization to scaled_fp4_quant_out.
//
// Returns {output, output_sf} where:
//   output    - uint8 tensor of shape (m, n / 2); two fp4 values per byte.
//   output_sf - scale factors: int32 with the shape from
//               vllm::computeSwizzledSFShape when is_sf_swizzled_layout,
//               otherwise uint8 of shape (m, n / CVT_FP4_SF_VEC_SIZE).
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
    torch::Tensor const& input, torch::Tensor const& input_sf,
    bool is_sf_swizzled_layout) {
  int64_t const n = input.size(-1);
  int64_t const m = input.numel() / n;
  auto const opts = torch::TensorOptions().device(input.device());

  // Two fp4 values are packed into each uint8 element.
  auto output = torch::empty({m, n / 2}, opts.dtype(torch::kUInt8));

  torch::Tensor output_sf;
  if (is_sf_swizzled_layout) {
    // Swizzled layout: padded shape computed by the shared helper,
    // stored as int32 elements.
    auto [sf_m, sf_n] = vllm::computeSwizzledSFShape(m, n);
    output_sf = torch::empty({sf_m, sf_n}, opts.dtype(torch::kInt32));
  } else {
    // Row-major layout: one uint8 scale per CVT_FP4_SF_VEC_SIZE inputs.
    output_sf =
        torch::empty({m, n / CVT_FP4_SF_VEC_SIZE}, opts.dtype(torch::kUInt8));
  }

  scaled_fp4_quant_out(input, input_sf, is_sf_swizzled_layout, output,
                       output_sf);
  return {output, output_sf};
}
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,

View File

@@ -18,6 +18,7 @@
#include <cuda_runtime.h>
#include <cuda_fp8.h>
#include <utility>
#include "../../cuda_vec_utils.cuh"
@@ -54,6 +55,18 @@ inline int computeEffectiveRows(int m) {
return round_up(m, ROW_TILE);
}
// Shape of the swizzled scale-factor (SF) output tensor for an (m, n) input.
// The row count is padded up to a multiple of 128; the per-row scale count
// (n / CVT_FP4_SF_VEC_SIZE) is padded up to a multiple of 4 and then divided
// by 4, i.e. the result is (round_up(m, 128),
// round_up(n / CVT_FP4_SF_VEC_SIZE, 4) / 4).
inline std::pair<int64_t, int64_t> computeSwizzledSFShape(int64_t m,
                                                          int64_t n) {
  constexpr int64_t kRowPad = 128;
  constexpr int64_t kColPad = 4;
  int64_t const padded_rows = round_up(m, kRowPad);
  int64_t const padded_scales = round_up(n / CVT_FP4_SF_VEC_SIZE, kColPad);
  return {padded_rows, padded_scales / kColPad};
}
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
uint32_t val;

View File

@@ -564,10 +564,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Compute NVFP4 block quantized tensor.
ops.def(
"scaled_fp4_quant(Tensor! output, Tensor input,"
" Tensor! output_scale, Tensor input_scale, bool "
"is_sf_swizzled_layout) -> ()");
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
"scaled_fp4_quant(Tensor input,"
" Tensor input_scale, bool "
"is_sf_swizzled_layout) -> (Tensor, Tensor)");
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant_func);
// Out variant
// TODO: Add {at::Tag::out_variant} tag and update all call sites
// to use the functional variant once vLLM upgrades PyTorch.
// See pytorch/pytorch#176117.
ops.def(
"scaled_fp4_quant.out(Tensor input,"
" Tensor input_scale, bool "
"is_sf_swizzled_layout, *, Tensor(a!) output, Tensor(b!) output_scale) "
"-> ()");
ops.impl("scaled_fp4_quant.out", torch::kCUDA, &scaled_fp4_quant_out);
// Compute NVFP4 experts quantization.
ops.def(

View File

@@ -72,6 +72,9 @@ In addition, we have the following custom APIs:
- Only applicable to [classification models](../models/pooling_models.md).
- [Score API](#score-api) (`/score`)
- Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
- [Cohere Embed API](#cohere-embed-api) (`/v2/embed`)
- Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed)
- Works with any [embedding model](../models/pooling_models.md), including multimodal models.
- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
@@ -429,6 +432,137 @@ these extra parameters are supported instead:
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
```
### Cohere Embed API
Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed), which adds support for some modern embedding features such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models).
#### Cohere Embed API request parameters
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `model` | string | Yes | Model name |
| `input_type` | string | No | Prompt prefix key (model-dependent, see below) |
| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) |
| `images` | list[string] | No | Base64 data URI images |
| `inputs` | list[object] | No | Mixed text and image content objects |
| `embedding_types` | list[string] | No | Output types (default: `["float"]`) |
| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) |
| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) |
#### Text embedding
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["Hello world", "How are you?"],
"embedding_types": ["float"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [
[0.012, -0.034, ...],
[0.056, 0.078, ...]
]
},
"texts": ["Hello world", "How are you?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 12}
}
}
```
#### Mixed text and image inputs
For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content:
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "google/siglip-so400m-patch14-384",
"inputs": [
{
"content": [
{"type": "text", "text": "A photo of a cat"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}
]
}
],
"embedding_types": ["float"]
}'
```
#### Embedding types
The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call:
| Type | Description |
| ---- | ----------- |
| `float` | Raw float32 embeddings (default) |
| `binary` | Bit-packed signed binary |
| `ubinary` | Bit-packed unsigned binary |
| `base64` | Little-endian float32 encoded as base64 |
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["What is machine learning?"],
"embedding_types": ["float", "binary"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [[0.012, -0.034, ...]],
"binary": [[42, -117, ...]]
},
"texts": ["What is machine learning?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 8}
}
}
```
#### Truncation
The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled:
| Value | Behavior |
| ----- | --------- |
| `END` (default) | Keep the first tokens, drop the end |
| `START` | Keep the last tokens, drop the beginning |
| `NONE` | Return an error if the input is too long |
#### Input type and prompt prefixes
The `input_type` field selects a prompt prefix to prepend to each text input. The available values
depend on the model:
- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are
the valid `input_type` values and the corresponding value is prepended to each text.
- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are
the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`,
so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`.
- **Other models**: `input_type` is not accepted and will raise a validation error if passed.
### Transcriptions API
Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);

View File

@@ -179,7 +179,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
def ops_in_model_before(self):
return [
torch.ops.vllm.all_reduce.default,
torch.ops._C.scaled_fp4_quant.default,
torch.ops._C.scaled_fp4_quant.out,
]

View File

@@ -0,0 +1,310 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Cohere /v2/embed API with generic (non-Cohere) models.
Validates that the Cohere v2 embed endpoint works correctly with standard
embedding models, covering text embedding, embedding type conversions,
response structure, batching, normalisation, and semantic similarity.
"""
import base64
import struct
import numpy as np
import pytest
import requests
from tests.utils import RemoteOpenAIServer
DTYPE = "bfloat16"
MODELS: list[tuple[str, list[str]]] = [
("intfloat/multilingual-e5-small", []),
(
"Snowflake/snowflake-arctic-embed-m-v1.5",
[
"--trust_remote_code",
"--hf_overrides",
'{"matryoshka_dimensions":[256]}',
],
),
]
@pytest.fixture(scope="module", params=MODELS, ids=lambda m: m[0])
def model_config(request):
    """Parametrized (model name, extra CLI args) pair — one per MODELS entry."""
    return request.param


@pytest.fixture(scope="module")
def model_name(model_config):
    """Model name component of the active model_config."""
    return model_config[0]
@pytest.fixture(scope="module")
def server(model_config):
    """Launch a pooling-runner vLLM server for the parametrized model.

    Appends any per-model extra args from MODELS to the shared base args.
    """
    name, extra_args = model_config
    base_args = [
        "--runner", "pooling",
        "--dtype", DTYPE,
        "--enforce-eager",
        "--max-model-len", "512",
        "--gpu-memory-utilization", "0.02",
    ]
    with RemoteOpenAIServer(name, base_args + extra_args) as remote_server:
        yield remote_server
def _cohere_embed(
    server: RemoteOpenAIServer,
    model_name: str,
    texts: list[str] | None = None,
    images: list[str] | None = None,
    input_type: str | None = None,
    embedding_types: list[str] | None = None,
) -> dict:
    """POST to /v2/embed, raise on HTTP error, and return the JSON body.

    Only keyword arguments that are not None are included in the request,
    so tests can exercise any subset of fields.
    """
    payload: dict = {"model": model_name}
    optional = {
        "input_type": input_type,
        "texts": texts,
        "images": images,
        "embedding_types": embedding_types,
    }
    payload.update({k: v for k, v in optional.items() if v is not None})
    response = requests.post(server.url_for("/v2/embed"), json=payload)
    response.raise_for_status()
    return response.json()
def _openai_embed(
    server: RemoteOpenAIServer, model_name: str, texts: list[str]
) -> dict:
    """POST to the OpenAI-compatible /v1/embeddings endpoint; return JSON."""
    response = requests.post(
        server.url_for("/v1/embeddings"),
        json={"model": model_name, "input": texts, "encoding_format": "float"},
    )
    response.raise_for_status()
    return response.json()
def _cosine_sim(a: list[float], b: list[float]) -> float:
va, vb = np.array(a), np.array(b)
return float(np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb)))
# -----------------------------------------------------------
# Text embedding tests
# -----------------------------------------------------------
def test_basic_embed(server: RemoteOpenAIServer, model_name: str):
    """A single text should yield exactly one non-empty float embedding."""
    result = _cohere_embed(
        server, model_name, texts=["hello world"], embedding_types=["float"]
    )
    assert "embeddings" in result
    floats = result["embeddings"]["float"]
    assert len(floats) == 1
    assert len(floats[0]) > 0
def test_unsupported_input_type_rejected(server: RemoteOpenAIServer, model_name: str):
"""An input_type not defined in the model's prompt config should be
rejected with a 400 error."""
body = {
"model": model_name,
"input_type": "nonexistent_type",
"texts": ["hello world"],
"embedding_types": ["float"],
}
resp = requests.post(server.url_for("/v2/embed"), json=body)
assert resp.status_code == 400
assert "Unsupported input_type" in resp.json()["error"]["message"]
def test_omitted_input_type_accepted(server: RemoteOpenAIServer, model_name: str):
"""Omitting input_type should always work (no prompt prefix applied)."""
body = {
"model": model_name,
"texts": ["hello world"],
"embedding_types": ["float"],
}
resp = requests.post(server.url_for("/v2/embed"), json=body)
assert resp.status_code == 200
data = resp.json()
assert len(data["embeddings"]["float"]) == 1
def test_v1_v2_parity(server: RemoteOpenAIServer, model_name: str):
"""v1 (OpenAI) and v2 (Cohere) endpoints should produce the same
float embeddings for a generic model."""
texts = ["hello world"]
v2 = _cohere_embed(server, model_name, texts=texts, embedding_types=["float"])
v1 = _openai_embed(server, model_name, texts)
cos = _cosine_sim(v2["embeddings"]["float"][0], v1["data"][0]["embedding"])
assert cos > 0.9999, f"v1/v2 parity failed, cosine={cos}"
def test_embedding_types(server: RemoteOpenAIServer, model_name: str):
r = _cohere_embed(
server,
model_name,
texts=["test"],
embedding_types=["float", "binary", "ubinary"],
)
dim = len(r["embeddings"]["float"][0])
assert len(r["embeddings"]["binary"][0]) == dim // 8
assert len(r["embeddings"]["ubinary"][0]) == dim // 8
def test_response_structure(server: RemoteOpenAIServer, model_name: str):
r = _cohere_embed(server, model_name, texts=["test"], embedding_types=["float"])
assert "id" in r
assert "embeddings" in r
assert "texts" in r
assert r["texts"] == ["test"]
assert "meta" in r
assert r["meta"]["api_version"]["version"] == "2"
assert "billed_units" in r["meta"]
assert r["meta"]["billed_units"]["input_tokens"] > 0
assert r["meta"]["billed_units"]["image_tokens"] == 0
def test_batch(server: RemoteOpenAIServer, model_name: str):
texts = ["apple", "banana", "cherry"]
r = _cohere_embed(server, model_name, texts=texts, embedding_types=["float"])
assert len(r["embeddings"]["float"]) == 3
dim = len(r["embeddings"]["float"][0])
for emb in r["embeddings"]["float"]:
assert len(emb) == dim
def test_l2_normalized(server: RemoteOpenAIServer, model_name: str):
r = _cohere_embed(
server, model_name, texts=["hello world"], embedding_types=["float"]
)
emb = np.array(r["embeddings"]["float"][0])
assert abs(float(np.linalg.norm(emb)) - 1.0) < 0.01
def test_semantic_similarity(server: RemoteOpenAIServer, model_name: str):
r = _cohere_embed(
server,
model_name,
texts=["machine learning", "deep learning", "chocolate cake recipe"],
embedding_types=["float"],
)
embs = r["embeddings"]["float"]
cos_related = _cosine_sim(embs[0], embs[1])
cos_unrelated = _cosine_sim(embs[0], embs[2])
assert cos_related > cos_unrelated
def test_missing_input_returns_error(server: RemoteOpenAIServer, model_name: str):
body = {"model": model_name}
resp = requests.post(server.url_for("/v2/embed"), json=body)
assert resp.status_code == 400
def test_base64_embedding_type(server: RemoteOpenAIServer, model_name: str):
r = _cohere_embed(
server,
model_name,
texts=["test encoding"],
embedding_types=["float", "base64"],
)
float_emb = r["embeddings"]["float"][0]
b64_str = r["embeddings"]["base64"][0]
decoded = struct.unpack(f"<{len(float_emb)}f", base64.b64decode(b64_str))
np.testing.assert_allclose(float_emb, decoded, rtol=1e-5)
# -----------------------------------------------------------
# Truncation tests
# -----------------------------------------------------------
def _cohere_embed_raw(
    server: RemoteOpenAIServer,
    body: dict,
) -> requests.Response:
    """POST a raw body to /v2/embed without raising on HTTP errors.

    Used by tests that need to inspect error status codes directly.
    """
    url = server.url_for("/v2/embed")
    return requests.post(url, json=body)
def test_truncate_end_succeeds(server: RemoteOpenAIServer, model_name: str):
"""truncate=END should silently truncate long input."""
long_text = " ".join(["word"] * 2000)
body = {
"model": model_name,
"texts": [long_text],
"embedding_types": ["float"],
"truncate": "END",
}
resp = _cohere_embed_raw(server, body)
assert resp.status_code == 200
data = resp.json()
assert len(data["embeddings"]["float"]) == 1
def test_truncate_start_succeeds(server: RemoteOpenAIServer, model_name: str):
"""truncate=START should silently truncate long input from the start."""
long_text = " ".join(["word"] * 2000)
body = {
"model": model_name,
"texts": [long_text],
"embedding_types": ["float"],
"truncate": "START",
}
resp = _cohere_embed_raw(server, body)
assert resp.status_code == 200
data = resp.json()
assert len(data["embeddings"]["float"]) == 1
def test_truncate_none_rejects_long_input(server: RemoteOpenAIServer, model_name: str):
"""truncate=NONE should error when input exceeds model context."""
long_text = " ".join(["word"] * 2000)
body = {
"model": model_name,
"texts": [long_text],
"embedding_types": ["float"],
"truncate": "NONE",
}
resp = _cohere_embed_raw(server, body)
assert resp.status_code == 400
def test_truncate_start_vs_end_differ(server: RemoteOpenAIServer, model_name: str):
"""START and END truncation should produce different embeddings
when the input is long enough to actually be truncated.
We construct input with distinct tokens at the start vs end
so that keeping different halves produces different embeddings.
"""
start_words = " ".join([f"alpha{i}" for i in range(300)])
end_words = " ".join([f"omega{i}" for i in range(300)])
long_text = start_words + " " + end_words
body_end = {
"model": model_name,
"texts": [long_text],
"embedding_types": ["float"],
"truncate": "END",
}
body_start = {
"model": model_name,
"texts": [long_text],
"embedding_types": ["float"],
"truncate": "START",
}
r_end = _cohere_embed_raw(server, body_end).json()
r_start = _cohere_embed_raw(server, body_start).json()
emb_end = r_end["embeddings"]["float"][0]
emb_start = r_start["embeddings"]["float"][0]
cos = _cosine_sim(emb_end, emb_start)
assert cos < 0.99, (
f"START and END truncation should produce different embeddings "
f"for long input, but cosine similarity was {cos}"
)

View File

@@ -0,0 +1,135 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Cohere /v2/embed API with a multimodal model (SigLIP).
Validates image embedding, batching, normalisation, and embedding type
conversions through the /v2/embed endpoint.
"""
import base64
import struct
import zlib
import numpy as np
import pytest
import requests
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "google/siglip-so400m-patch14-384"
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
"--dtype",
DTYPE,
"--enforce-eager",
"--max-model-len",
"64",
"--gpu-memory-utilization",
"0.3",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
def _make_tiny_png(r: int, g: int, b: int, w: int = 2, h: int = 2) -> str:
raw = b""
for _ in range(h):
raw += b"\x00" + bytes([r, g, b]) * w
compressed = zlib.compress(raw)
def chunk(ctype: bytes, cdata: bytes) -> bytes:
c = ctype + cdata
return (
struct.pack(">I", len(cdata))
+ c
+ struct.pack(">I", zlib.crc32(c) & 0xFFFFFFFF)
)
ihdr = struct.pack(">IIBBBBB", w, h, 8, 2, 0, 0, 0)
png = (
b"\x89PNG\r\n\x1a\n"
+ chunk(b"IHDR", ihdr)
+ chunk(b"IDAT", compressed)
+ chunk(b"IEND", b"")
)
return "data:image/png;base64," + base64.b64encode(png).decode()
def _cohere_embed(
    server: RemoteOpenAIServer,
    texts: list[str] | None = None,
    images: list[str] | None = None,
    embedding_types: list[str] | None = None,
) -> dict:
    """POST to /v2/embed for MODEL_NAME; raise on HTTP error, return JSON.

    Fields that are None are omitted from the request body entirely.
    """
    payload: dict = {"model": MODEL_NAME}
    for key, value in (
        ("texts", texts),
        ("images", images),
        ("embedding_types", embedding_types),
    ):
        if value is not None:
            payload[key] = value
    response = requests.post(server.url_for("/v2/embed"), json=payload)
    response.raise_for_status()
    return response.json()
def test_image_embed(server: RemoteOpenAIServer):
img_uri = _make_tiny_png(255, 0, 0)
r = _cohere_embed(
server,
images=[img_uri],
embedding_types=["float"],
)
assert "embeddings" in r
assert len(r["embeddings"]["float"]) == 1
assert len(r["embeddings"]["float"][0]) > 0
assert r["meta"]["billed_units"]["image_tokens"] > 0
assert r["meta"]["billed_units"]["input_tokens"] == 0
def test_image_batch(server: RemoteOpenAIServer):
red = _make_tiny_png(255, 0, 0)
blue = _make_tiny_png(0, 0, 255)
r = _cohere_embed(
server,
images=[red, blue],
embedding_types=["float"],
)
assert len(r["embeddings"]["float"]) == 2
def test_image_l2_normalized(server: RemoteOpenAIServer):
img_uri = _make_tiny_png(0, 255, 0)
r = _cohere_embed(
server,
images=[img_uri],
embedding_types=["float"],
)
emb = np.array(r["embeddings"]["float"][0])
assert abs(float(np.linalg.norm(emb)) - 1.0) < 0.01
def test_image_embedding_types(server: RemoteOpenAIServer):
img_uri = _make_tiny_png(128, 128, 128)
r = _cohere_embed(
server,
images=[img_uri],
embedding_types=["float", "binary", "ubinary"],
)
dim = len(r["embeddings"]["float"][0])
assert len(r["embeddings"]["binary"][0]) == dim // 8
assert len(r["embeddings"]["ubinary"][0]) == dim // 8
def test_text_embed_on_multimodal(server: RemoteOpenAIServer):
"""SigLIP also supports text-only embedding via /v2/embed."""
r = _cohere_embed(server, texts=["hello world"], embedding_types=["float"])
assert "embeddings" in r
assert len(r["embeddings"]["float"]) == 1
assert len(r["embeddings"]["float"][0]) > 0

View File

@@ -0,0 +1,102 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Parity test between Cohere /v2/embed and OpenAI /v1/embeddings.
Verifies that both endpoints produce identical float embeddings when
no prompt prefix is applied (input_type omitted for Cohere /v2/embed).
"""
import numpy as np
import pytest
import requests
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "BAAI/bge-base-en-v1.5"
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
"--dtype",
DTYPE,
"--enforce-eager",
"--max-model-len",
"512",
"--gpu-memory-utilization",
"0.02",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
def _cohere_embed(
    server: RemoteOpenAIServer,
    texts: list[str],
) -> list[list[float]]:
    """Embed texts through /v2/embed and return the float vectors."""
    response = requests.post(
        server.url_for("/v2/embed"),
        json={
            "model": MODEL_NAME,
            "texts": texts,
            "embedding_types": ["float"],
        },
    )
    response.raise_for_status()
    return response.json()["embeddings"]["float"]
def _openai_embed(
    server: RemoteOpenAIServer,
    texts: list[str],
) -> list[list[float]]:
    """Embed texts through /v1/embeddings and return the float vectors."""
    payload = {"model": MODEL_NAME, "input": texts, "encoding_format": "float"}
    response = requests.post(server.url_for("/v1/embeddings"), json=payload)
    response.raise_for_status()
    return [row["embedding"] for row in response.json()["data"]]
def test_single_text_parity(server: RemoteOpenAIServer):
"""A single text should produce identical embeddings via both APIs."""
texts = ["the quick brown fox jumps over the lazy dog"]
v2 = _cohere_embed(server, texts)
v1 = _openai_embed(server, texts)
np.testing.assert_allclose(v2[0], v1[0], rtol=1e-5)
def test_batch_parity(server: RemoteOpenAIServer):
"""A batch of texts should produce identical embeddings via both APIs,
in the same order."""
texts = [
"machine learning",
"deep learning",
"natural language processing",
]
v2 = _cohere_embed(server, texts)
v1 = _openai_embed(server, texts)
assert len(v2) == len(v1) == 3
for i in range(3):
np.testing.assert_allclose(v2[i], v1[i], rtol=1e-5, err_msg=f"index {i}")
def test_token_count_parity(server: RemoteOpenAIServer):
"""Both APIs should report the same prompt token count."""
texts = ["hello world"]
v2_resp = requests.post(
server.url_for("/v2/embed"),
json={
"model": MODEL_NAME,
"texts": texts,
"embedding_types": ["float"],
},
)
v1_resp = requests.post(
server.url_for("/v1/embeddings"),
json={"model": MODEL_NAME, "input": texts, "encoding_format": "float"},
)
v2_resp.raise_for_status()
v1_resp.raise_for_status()
v2_tokens = v2_resp.json()["meta"]["billed_units"]["input_tokens"]
v1_tokens = v1_resp.json()["usage"]["prompt_tokens"]
assert v2_tokens == v1_tokens

View File

@@ -0,0 +1,208 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for EmbedIOProcessor."""
import pytest
from vllm.entrypoints.pooling.embed.io_processor import EmbedIOProcessor
from vllm.entrypoints.pooling.embed.protocol import (
CohereEmbedRequest,
)
class TestResolveTruncation:
    """Unit tests for EmbedIOProcessor._resolve_cohere_truncation."""

    @staticmethod
    def _make_request(**kwargs) -> CohereEmbedRequest:
        # Minimal valid request; callers override fields via kwargs.
        base = {
            "model": "test",
            "input_type": "search_document",
            "texts": ["hello"],
        }
        return CohereEmbedRequest(**(base | kwargs))

    def test_truncate_end_default(self):
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request()
        )
        assert limit == -1
        assert side is None

    def test_truncate_end_explicit(self):
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request(truncate="END")
        )
        assert limit == -1
        assert side is None

    def test_truncate_end_with_max_tokens(self):
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request(truncate="END", max_tokens=128)
        )
        assert limit == 128
        assert side is None

    def test_truncate_none(self):
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request(truncate="NONE")
        )
        assert limit is None
        assert side is None

    def test_truncate_none_with_max_tokens(self):
        """truncate=NONE should NOT set truncate_prompt_tokens; the
        max_tokens limit is enforced separately via _check_max_tokens."""
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request(truncate="NONE", max_tokens=10)
        )
        assert limit is None
        assert side is None

    def test_truncate_start(self):
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request(truncate="START")
        )
        assert limit == -1
        assert side == "left"

    def test_truncate_start_with_max_tokens(self):
        limit, side = EmbedIOProcessor._resolve_cohere_truncation(
            self._make_request(truncate="START", max_tokens=64)
        )
        assert limit == 64
        assert side == "left"
class TestApplyStPrompt:
"""Unit tests for EmbedIOProcessor._apply_task_instruction."""
@staticmethod
def _make_handler(task_instructions: dict[str, str] | None):
handler = object.__new__(EmbedIOProcessor)
handler.task_instructions = task_instructions
return handler
def test_no_prompts_configured(self):
handler = self._make_handler(None)
texts = ["hello", "world"]
assert handler._apply_task_instruction(texts, "query") is texts
def test_matching_input_type(self):
handler = self._make_handler({"query": "search_query: "})
result = handler._apply_task_instruction(["hello"], "query")
assert result == ["search_query: hello"]
def test_non_matching_input_type(self):
handler = self._make_handler({"query": "search_query: "})
texts = ["hello"]
assert handler._apply_task_instruction(texts, "document") is texts
def test_multiple_texts(self):
handler = self._make_handler(
{"query": "Represent this sentence for searching: "}
)
result = handler._apply_task_instruction(["a", "b", "c"], "query")
assert result == [
"Represent this sentence for searching: a",
"Represent this sentence for searching: b",
"Represent this sentence for searching: c",
]
def test_empty_prefix_returns_unchanged(self):
handler = self._make_handler({"passage": ""})
texts = ["hello"]
assert handler._apply_task_instruction(texts, "passage") is texts
class TestLoadTaskInstructions:
"""Unit tests for EmbedIOProcessor._load_task_instructions."""
def test_no_attribute(self):
class FakeConfig:
pass
assert EmbedIOProcessor._load_task_instructions(FakeConfig()) is None
def test_with_task_instructions(self):
class FakeConfig:
task_instructions = {
"retrieval.query": "Represent the query: ",
"retrieval.passage": "",
}
result = EmbedIOProcessor._load_task_instructions(FakeConfig())
assert result == {
"retrieval.query": "Represent the query: ",
"retrieval.passage": "",
}
def test_empty_dict(self):
class FakeConfig:
task_instructions = {}
assert EmbedIOProcessor._load_task_instructions(FakeConfig()) is None
def test_non_dict(self):
class FakeConfig:
task_instructions = "not a dict"
assert EmbedIOProcessor._load_task_instructions(FakeConfig()) is None
class TestCheckMaxTokens:
    """Unit tests for EmbedIOProcessor._check_cohere_max_tokens."""

    @staticmethod
    def _fake_output(n_tokens: int):
        """Stub exposing only the ``prompt_token_ids`` attribute."""

        class _Out:
            def __init__(self, n: int):
                self.prompt_token_ids = list(range(n))

        return _Out(n_tokens)

    def test_none_check_is_noop(self):
        """max_tokens_check=None disables the check entirely."""
        EmbedIOProcessor._check_cohere_max_tokens([self._fake_output(100)], None)

    def test_within_limit(self):
        """Outputs below the limit pass silently."""
        outputs = [self._fake_output(5), self._fake_output(3)]
        EmbedIOProcessor._check_cohere_max_tokens(outputs, 5)

    def test_exceeds_limit(self):
        """Any output above the limit raises ValueError."""
        outputs = [self._fake_output(3), self._fake_output(10)]
        with pytest.raises(ValueError, match="exceeds max_tokens=5"):
            EmbedIOProcessor._check_cohere_max_tokens(outputs, 5)

    def test_exact_limit(self):
        """Exactly max_tokens tokens is still allowed (inclusive bound)."""
        EmbedIOProcessor._check_cohere_max_tokens([self._fake_output(5)], 5)
class TestValidateInputType:
    """Unit tests for EmbedIOProcessor._validate_input_type."""

    @staticmethod
    def _make_handler(task_instructions: dict[str, str] | None):
        """Build a bare handler without running __init__ (and its I/O)."""
        handler = object.__new__(EmbedIOProcessor)
        handler.task_instructions = task_instructions
        return handler

    def test_none_input_type_always_accepted(self):
        """input_type=None passes whether or not instructions exist."""
        self._make_handler(None)._validate_input_type(None)
        self._make_handler({"query": "q: "})._validate_input_type(None)

    def test_no_prompts_rejects(self):
        """Any explicit input_type fails when no instructions are configured."""
        handler = self._make_handler(None)
        with pytest.raises(ValueError, match="does not define any input_type"):
            handler._validate_input_type("anything")

    def test_known_type_accepted(self):
        """Configured input types validate successfully."""
        handler = self._make_handler({"query": "q: ", "document": "d: "})
        for input_type in ("query", "document"):
            handler._validate_input_type(input_type)

    def test_unknown_type_rejected(self):
        """Unconfigured input types raise a descriptive error."""
        handler = self._make_handler({"query": "q: ", "document": "d: "})
        with pytest.raises(ValueError, match="Unsupported input_type 'other'"):
            handler._validate_input_type("other")

    def test_error_lists_supported(self):
        """The error message enumerates the supported types, sorted."""
        handler = self._make_handler({"a": "", "b": ""})
        with pytest.raises(ValueError, match="Supported values: a, b"):
            handler._validate_input_type("z")

View File

@@ -0,0 +1,129 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for Cohere embed protocol: build_typed_embeddings and its
underlying packing helpers, plus Cohere-specific serving helpers."""
import base64
import struct
import numpy as np
import pytest
from vllm.entrypoints.pooling.embed.protocol import (
build_typed_embeddings,
)
@pytest.fixture
def sample_embeddings() -> list[list[float]]:
    """Two 8-dim embeddings with alternating signs; dimension is a
    multiple of 8 so the binary-packing paths accept them."""
    first = [0.1, -0.2, 0.3, -0.4, 0.5, -0.6, 0.7, -0.8]
    second = [-0.05, 0.15, -0.25, 0.35, -0.45, 0.55, -0.65, 0.75]
    return [first, second]
class TestBuildTypedEmbeddingsFloat:
    def test_float_passthrough(self, sample_embeddings: list[list[float]]):
        """'float' returns the embeddings untouched; other fields stay unset."""
        result = build_typed_embeddings(sample_embeddings, ["float"])
        assert result.float == sample_embeddings
        assert result.binary is None

    def test_empty_input(self):
        """An empty batch yields an empty float list."""
        assert build_typed_embeddings([], ["float"]).float == []
class TestBuildTypedEmbeddingsBinary:
    def test_binary_packing(self):
        """Alternating signs pack to 0b10101010 (=170); signed byte is 42."""
        alternating = [1.0, -1.0] * 4
        result = build_typed_embeddings([alternating], ["binary"])
        assert result.binary is not None
        assert result.binary[0] == [42]

    def test_ubinary_packing(self):
        """Same bit pattern through the unsigned path: 0b10101010 == 170."""
        alternating = [1.0, -1.0] * 4
        result = build_typed_embeddings([alternating], ["ubinary"])
        assert result.ubinary is not None
        assert result.ubinary[0] == [170]

    def test_binary_all_positive(self):
        """All-positive values set every bit: 0xFF, signed 255-128=127."""
        result = build_typed_embeddings([[0.1] * 8], ["binary"])
        assert result.binary is not None
        assert result.binary[0] == [127]

    def test_binary_all_negative(self):
        """All-negative values clear every bit: 0x00, signed 0-128=-128."""
        result = build_typed_embeddings([[-0.1] * 8], ["binary"])
        assert result.binary is not None
        assert result.binary[0] == [-128]

    def test_binary_dimension_is_eighth(self, sample_embeddings: list[list[float]]):
        """Packing shrinks each embedding to one byte per 8 dims."""
        result = build_typed_embeddings(sample_embeddings, ["binary"])
        assert result.binary is not None
        for source, packed in zip(sample_embeddings, result.binary):
            assert len(packed) == len(source) // 8

    def test_zero_treated_as_positive(self):
        """0.0 counts as non-negative, so every bit is set (signed 127)."""
        result = build_typed_embeddings([[0.0] * 8], ["binary"])
        assert result.binary is not None
        assert result.binary[0] == [127]

    def test_non_multiple_of_8_raises(self):
        """binary requires a dimension divisible by 8."""
        with pytest.raises(ValueError, match="multiple of 8"):
            build_typed_embeddings([[0.1] * 7], ["binary"])

    def test_ubinary_non_multiple_of_8_raises(self):
        """ubinary enforces the same divisibility rule."""
        with pytest.raises(ValueError, match="multiple of 8"):
            build_typed_embeddings([[0.1] * 10], ["ubinary"])
class TestBuildTypedEmbeddingsBase64:
    def test_base64_roundtrip(self, sample_embeddings: list[list[float]]):
        """Decoding each base64 string recovers the little-endian float32
        values of the original embedding."""
        result = build_typed_embeddings(sample_embeddings, ["base64"])
        assert result.base64 is not None
        assert len(result.base64) == 2
        for source, encoded in zip(sample_embeddings, result.base64):
            raw = base64.b64decode(encoded)
            decoded = struct.unpack(f"<{len(source)}f", raw)
            np.testing.assert_allclose(source, decoded, rtol=1e-5)

    def test_base64_byte_length(self):
        """Each float is serialized as exactly 4 bytes (float32)."""
        result = build_typed_embeddings([[0.1, 0.2, 0.3]], ["base64"])
        assert result.base64 is not None
        decoded = base64.b64decode(result.base64[0])
        assert len(decoded) == 3 * 4  # 3 floats * 4 bytes each
class TestBuildTypedEmbeddingsMultiple:
    def test_all_types_at_once(self, sample_embeddings: list[list[float]]):
        """Requesting every type populates every field."""
        result = build_typed_embeddings(
            sample_embeddings,
            ["float", "binary", "ubinary", "base64"],
        )
        for field in ("float", "binary", "ubinary", "base64"):
            assert getattr(result, field) is not None

    def test_subset_types(self, sample_embeddings: list[list[float]]):
        """Only the requested types are populated; the rest stay None."""
        result = build_typed_embeddings(sample_embeddings, ["float", "binary"])
        assert result.float is not None
        assert result.binary is not None
        assert result.ubinary is None
        assert result.base64 is None

    def test_unknown_type_ignored(self, sample_embeddings: list[list[float]]):
        """Unrecognized type names are silently skipped."""
        result = build_typed_embeddings(sample_embeddings, ["float", "unknown_type"])
        assert result.float is not None

View File

@@ -159,6 +159,52 @@ def test_quantize_to_fp4(
torch.testing.assert_close(scale_ans, scale_ref)
@pytest.mark.parametrize(
    "shape",
    [(32, 4096), (128, 4096), (1, 64), (127, 1024), (256, 16384)],
)
@pytest.mark.parametrize("is_sf_swizzled_layout", [True, False])
@torch.inference_mode()
def test_python_util_matches_cpp_allocation(
    shape: tuple[int, int],
    is_sf_swizzled_layout: bool,
) -> None:
    """
    Verify that the Python utility (create_fp4_output_tensors) allocates
    tensors with the same shapes and dtypes as the C++ functional variant
    (scaled_fp4_quant_func).
    """
    from vllm._custom_ops import create_fp4_output_tensors

    device = torch.device("cuda:0")
    # set_default_device is process-global; restore it afterwards so this
    # test does not leak CUDA-default allocation into the rest of the suite.
    torch.set_default_device(device)
    try:
        m, n = shape
        input_tensor = torch.randn((m, n), dtype=torch.bfloat16)
        input_scale = torch.tensor([1.0], dtype=torch.float32, device=device)

        # C++ functional variant allocates its outputs internally.
        cpp_out, cpp_scale = torch.ops._C.scaled_fp4_quant(
            input_tensor, input_scale, is_sf_swizzled_layout
        )
        # The Python utility must mirror that allocation exactly.
        py_out, py_scale = create_fp4_output_tensors(
            m, n, device, is_sf_swizzled_layout
        )

        assert py_out.shape == cpp_out.shape, (
            f"Output shape mismatch: Python {py_out.shape} vs C++ {cpp_out.shape}"
        )
        assert py_out.dtype == cpp_out.dtype, (
            f"Output dtype mismatch: Python {py_out.dtype} vs C++ {cpp_out.dtype}"
        )
        assert py_scale.shape == cpp_scale.shape, (
            f"Scale shape mismatch: Python {py_scale.shape} vs C++ {cpp_scale.shape}"
        )
        assert py_scale.dtype == cpp_scale.dtype, (
            f"Scale dtype mismatch: Python {py_scale.dtype} vs C++ {cpp_scale.dtype}"
        )
    finally:
        # Passing None resets the default device to its original behavior.
        torch.set_default_device(None)
@pytest.mark.parametrize("pad_shape", PAD_SHAPES)
@torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:

View File

@@ -777,6 +777,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForCausalLM,
patch_hf_runner=model_utils.paddleocr_vl_patch_hf_runner,
image_size_factors=[(0.25,)],
marks=[
pytest.mark.skipif(

View File

@@ -1149,6 +1149,31 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def paddleocr_vl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches the HfRunner to fix create_causal_mask API mismatch.

    The PaddleOCR-VL HF model passes `inputs_embeds` to create_causal_mask,
    but transformers renamed this parameter to `input_embeds`. The wrapper
    rewrites the keyword on the fly.

    The patch is installed on the model's own module object (which is
    shared process-wide), so it is guarded against double-wrapping when
    multiple HfRunner instances are created in one process.
    """
    import sys

    # The model code resolves create_causal_mask through its module globals,
    # so that module object is where the patch must live.
    model_module = sys.modules.get(type(hf_model.model.model).__module__)
    if model_module is None:
        return hf_model

    original_create_causal_mask = getattr(model_module, "create_causal_mask", None)
    if original_create_causal_mask is None:
        return hf_model

    # Already patched (e.g. a second HfRunner in the same process): do not
    # wrap the wrapper again.
    if getattr(original_create_causal_mask, "_vllm_inputs_embeds_patch", False):
        return hf_model

    def patched_create_causal_mask(*args, **kwargs):
        # Translate the legacy keyword to the renamed one.
        if "inputs_embeds" in kwargs:
            kwargs["input_embeds"] = kwargs.pop("inputs_embeds")
        return original_create_causal_mask(*args, **kwargs)

    patched_create_causal_mask._vllm_inputs_embeds_patch = True
    model_module.create_causal_mask = patched_create_causal_mask  # type: ignore[attr-defined]
    return hf_model
def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
thinker = hf_model.model.thinker

View File

@@ -29,6 +29,81 @@ else:
from torch.library import impl_abstract as register_fake
# scaled_fp4_quant functional + out variant for torch.compile buffer management
def create_fp4_scale_tensor(
    m: int,
    n: int,
    device: torch.device,
    is_sf_swizzled_layout: bool,
) -> torch.Tensor:
    """
    Allocate the output scale tensor for scaled_fp4_quant.

    Args:
        m: number of rows of the (2D-flattened) input.
        n: number of columns; one scale is produced per 16-element block.
        device: device on which to allocate the tensor.
        is_sf_swizzled_layout: use the swizzled Tensor Core scale layout.

    When is_sf_swizzled_layout=True, we use rounded values to store the
    swizzled scales. Due to the requirement of the Tensor Core, the minimum
    tile is 128x4 for the scales. So, we first pad the scales to multiples
    of 128 (rows) and 4 (cols). Then, the scales (in float8_e4m3fn) are
    packed into an int32 for every 4 values. More:
    https://docs.nvidia.com/cuda/parallel-thread-execution/
    #tcgen05-mma-scale-factor-b-layout-4x
    """
    block_size = 16  # one scale per 16 quantized values

    if is_sf_swizzled_layout:
        # Ceiling-to-multiple computed inline: avoids a per-call import of
        # vllm.utils.math_utils.round_up on this hot allocation path.
        rounded_m = (m + 127) // 128 * 128
        scale_n = n // block_size
        rounded_n = (scale_n + 3) // 4 * 4
        # Four fp8 scale values are packed into each int32 element.
        return torch.empty(
            (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
        )

    return torch.empty((m, n // block_size), device=device, dtype=torch.uint8)
def create_fp4_output_tensors(
    m: int,
    n: int,
    device: torch.device,
    is_sf_swizzled_layout: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Allocate both output tensors for scaled_fp4_quant:
    (quantized_output, output_scale).

    Must match the C++ scaled_fp4_quant_func allocation exactly.
    """
    # Two fp4 values are packed per uint8, halving the last dimension.
    packed = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
    scales = create_fp4_scale_tensor(m, n, device, is_sf_swizzled_layout)
    return packed, scales
# Register "fake" (meta) implementations so torch.compile / FakeTensor
# tracing can infer output shapes and dtypes without executing the CUDA
# kernels. Guarded because the _C extension op may be absent on some
# platforms/builds.
if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "scaled_fp4_quant"):

    @register_fake("_C::scaled_fp4_quant")
    def _scaled_fp4_quant_fake(
        input: torch.Tensor,
        input_scale: torch.Tensor,
        is_sf_swizzled_layout: bool,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Functional variant: allocate outputs with the same shapes/dtypes
        # as the real kernel would (delegates to create_fp4_output_tensors,
        # which mirrors the C++ allocation).
        n = input.shape[-1]
        m = input.numel() // n
        return create_fp4_output_tensors(m, n, input.device, is_sf_swizzled_layout)

    @register_fake("_C::scaled_fp4_quant.out")
    def _scaled_fp4_quant_out_fake(
        input: torch.Tensor,
        input_scale: torch.Tensor,
        is_sf_swizzled_layout: bool,
        *,
        output: torch.Tensor,
        output_scale: torch.Tensor,
    ) -> None:
        # Out variant writes into caller-provided buffers; nothing to
        # allocate, so the fake impl is a no-op.
        return None
# page attention ops
def paged_attention_v1(
out: torch.Tensor,
@@ -1644,7 +1719,6 @@ def scaled_fp4_quant(
input = input.reshape(other_dims, input.shape[-1])
m, n = input.shape
block_size = 16
device = input.device
assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
assert input.dtype in (torch.float16, torch.bfloat16), (
@@ -1658,26 +1732,16 @@ def scaled_fp4_quant(
input, input_global_scale
)
else:
# Two fp4 values will be packed into an uint8.
output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
if is_sf_swizzled_layout:
# We use the rounded values to store the swizzled values. Due to the
# requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
# So, we first pad the scales to multiples of 128 and 4. Then, the scales
# (in float8_e4m3fn) are packed into an int32 for every 4 values. More:
# https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
round_up = lambda x, y: (x + y - 1) // y * y
rounded_m = round_up(m, 128)
scale_n = n // block_size
rounded_n = round_up(scale_n, 4)
output_scale = torch.empty(
(rounded_m, rounded_n // 4), device=device, dtype=torch.int32
)
else:
output_scale = torch.empty((m, n // 16), device=device, dtype=torch.uint8)
torch.ops._C.scaled_fp4_quant(
output, input, output_scale, input_global_scale, is_sf_swizzled_layout
# Pre-allocate and call .out variant (same behavior as old in-place API)
output, output_scale = create_fp4_output_tensors(
m, n, input.device, is_sf_swizzled_layout
)
torch.ops._C.scaled_fp4_quant.out(
input,
input_global_scale,
is_sf_swizzled_layout,
output=output,
output_scale=output_scale,
)
output_scale = output_scale.view(torch.float8_e4m3fn)

View File

@@ -148,11 +148,11 @@ class SiluMulNvfp4QuantPattern(ActivationQuantPattern):
result_silu_mul = self.silu_and_mul_matcher(input)
at = auto_functionalized(
self.QUANT_OP,
output=result,
input=result_silu_mul,
output_scale=output_scale,
input_scale=scale,
is_sf_swizzled_layout=True,
output=result,
output_scale=output_scale,
)
return at[1], at[2]

View File

@@ -47,7 +47,7 @@ if find_spec("flashinfer"):
pass
if hasattr(torch.ops._C, "scaled_fp4_quant"):
STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default
STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.out
# Max size of the input tensor per world size per device capability
# to use flashinfer fused allreduce
@@ -562,11 +562,11 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
rms = self.rmsnorm_matcher(all_reduce, weight)
quant_out_tuple = auto_functionalized(
STATIC_FP4_QUANT_OP,
output=quant_result,
input=rms,
output_scale=output_scale,
input_scale=input_global_scale,
is_sf_swizzled_layout=True,
output=quant_result,
output_scale=output_scale,
)
# quant_out, allreduce_output, output_scale
@@ -660,11 +660,11 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
quant_out_tuple = auto_functionalized(
STATIC_FP4_QUANT_OP,
output=quant_result,
input=rms,
output_scale=output_scale,
input_scale=input_global_scale,
is_sf_swizzled_layout=True,
output=quant_result,
output_scale=output_scale,
)
# quant_out, allreduce_output, output_scale

View File

@@ -250,11 +250,11 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
)
at2 = auto_functionalized(
self.QUANT_OP,
output=output_quant,
input=attn_out_view,
output_scale=output_scale,
input_scale=input_scale,
is_sf_swizzled_layout=True,
output=output_quant,
output_scale=output_scale,
)
output_scale_view = torch.ops.aten.view.dtype(at2[2], FP8_DTYPE)
return at2[1], output_scale_view

View File

@@ -38,7 +38,7 @@ QUANT_OPS: dict[QuantKey, OpOverload] = {
}
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
QUANT_OPS[kNvfp4Dynamic] = torch.ops._C.scaled_fp4_quant.default # noqa: E501
QUANT_OPS[kNvfp4Dynamic] = torch.ops._C.scaled_fp4_quant.out # noqa: E501
if current_platform.is_cuda():
QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501

View File

@@ -63,7 +63,7 @@ QUANT_OPS: dict[QuantKey, OpOverload] = {
kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
}
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
QUANT_OPS[kNvfp4Dynamic] = torch.ops._C.scaled_fp4_quant.default
QUANT_OPS[kNvfp4Dynamic] = torch.ops._C.scaled_fp4_quant.out
if current_platform.is_cuda():
QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501
QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501

View File

@@ -112,7 +112,12 @@ class TorchCompileWithNoGuardsWrapper:
entry.guard_type == "SHAPE_ENV" for entry in x
]
else:
options["guard_filter_fn"] = torch.compiler.skip_all_guards_unsafe
if hasattr(torch.compiler, "skip_all_guards_unsafe"):
# Torch 2.10+ provides skip_all_guards_unsafe
options["guard_filter_fn"] = torch.compiler.skip_all_guards_unsafe
else:
# Equivalent fallback for older PyTorch: skip all guards
options["guard_filter_fn"] = lambda x: [False for _ in x]
compiled_ptr: Any = self.forward
# Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False

View File

@@ -138,6 +138,13 @@ class ParallelConfig:
"""Whether the deployed model is MoE (if known)."""
enable_expert_parallel: bool = False
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
enable_ep_weight_filter: bool = False
"""Skip non-local expert weights during model loading when expert
parallelism is active. Each rank only reads its own expert shard from
disk, which can drastically reduce storage I/O for MoE models with
per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5). Has no
effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
models."""
enable_eplb: bool = False
"""Enable expert parallelism load balancing for MoE layers."""
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)

View File

@@ -419,6 +419,7 @@ class EngineArgs:
data_parallel_external_lb: bool = False
data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter
moe_backend: MoEBackend = KernelConfig.moe_backend
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
@@ -901,6 +902,10 @@ class EngineArgs:
"-ep",
**parallel_kwargs["enable_expert_parallel"],
)
parallel_group.add_argument(
"--enable-ep-weight-filter",
**parallel_kwargs["enable_ep_weight_filter"],
)
parallel_group.add_argument(
"--all2all-backend", **parallel_kwargs["all2all_backend"]
)
@@ -1727,6 +1732,7 @@ class EngineArgs:
data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
is_moe_model=model_config.is_moe,
enable_expert_parallel=self.enable_expert_parallel,
enable_ep_weight_filter=self.enable_ep_weight_filter,
all2all_backend=self.all2all_backend,
enable_elastic_ep=self.enable_elastic_ep,
enable_dbo=self.enable_dbo,

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Annotated, Any
from typing import Annotated, Any, Literal
from pydantic import Field, model_validator
@@ -24,6 +24,14 @@ class PoolingBasicRequestMixin(OpenAIBaseModel):
# --8<-- [start:pooling-common-extra-params]
truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
truncation_side: Literal["left", "right"] | None = Field(
default=None,
description=(
"Which side to truncate from when truncate_prompt_tokens is active. "
"'right' keeps the first N tokens. "
"'left' keeps the last N tokens."
),
)
request_id: str = Field(
default_factory=random_uuid,
description=(

View File

@@ -32,6 +32,7 @@ class ClassificationCompletionRequest(
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=self.add_special_tokens,
max_total_tokens_param="max_model_len",
@@ -54,6 +55,7 @@ class ClassificationChatRequest(
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=self.add_special_tokens,
max_total_tokens_param="max_model_len",

View File

@@ -7,12 +7,12 @@ from fastapi import APIRouter, Depends, Request
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.utils import (
load_aware_call,
with_cancellation,
from vllm.entrypoints.pooling.embed.protocol import (
CohereEmbedRequest,
EmbeddingRequest,
)
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.utils import load_aware_call, with_cancellation
router = APIRouter()
@@ -40,3 +40,24 @@ async def create_embedding(
raise NotImplementedError("The model does not support Embeddings API")
return await handler(request, raw_request)
@router.post(
    "/v2/embed",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_cohere_embedding(
    request: CohereEmbedRequest,
    raw_request: Request,
):
    """Cohere-compatible /v2/embed endpoint; delegates to the shared
    embedding handler resolved from the app state."""
    if (handler := embedding(raw_request)) is None:
        raise NotImplementedError("The model does not support Embeddings API")
    return await handler(request, raw_request)

View File

@@ -1,14 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, cast
from collections.abc import Sequence
from typing import Any, Literal, cast
import torch
from openai.types.chat import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from openai.types.chat.chat_completion_content_part_image_param import ImageURL
from vllm import PoolingParams
from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartParam,
ChatCompletionMessageParam,
CustomChatCompletionMessageParam,
)
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.embed.protocol import (
CohereEmbedInput,
CohereEmbedRequest,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
)
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.inputs.data import ProcessorInputs, token_inputs
from vllm.logger import init_logger
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.renderers import merge_kwargs
from vllm.utils.collection_utils import chunk_list
from vllm.utils.mistral import is_mistral_tokenizer
logger = init_logger(__name__)
class EmbedIOProcessor(PoolingIOProcessor):
@@ -21,16 +44,45 @@ class EmbedIOProcessor(PoolingIOProcessor):
self.pooler_config = self.model_config.pooler_config
self.enable_chunked_processing = self.pooler_config.enable_chunked_processing
# Load task instructions from HF config or sentence-transformers config
self.task_instructions: dict[str, str] | None = self._load_task_instructions(
self.model_config.hf_config
) or self._load_st_prompts(self.model_config.model, self.model_config.revision)
if self.task_instructions:
logger.info(
"Loaded prompt prefixes for input_type: %s",
list(self.task_instructions.keys()),
)
def pre_process_online(self, ctx: PoolingServeContext):
    """Route request preprocessing: Cohere requests use their own
    conversion path, all other request types use the base (OpenAI-style)
    path. Either way, chunked long-text processing is applied afterwards
    when enabled."""
    if isinstance(ctx.request, CohereEmbedRequest):
        self._pre_process_cohere_online(ctx)
    else:
        super().pre_process_online(ctx)
    # Chunking operates on ctx.engine_prompts, so it runs after either
    # preprocessing branch has populated them.
    if self.enable_chunked_processing:
        self._pre_process_chunked(ctx)
def post_process_online(
self,
ctx: PoolingServeContext,
):
if ctx.final_res_batch is None:
raise ValueError("Final response batch not available")
if not self.enable_chunked_processing:
self._enforce_cohere_max_tokens(ctx)
return super().post_process_online(ctx)
self._post_process_chunked(ctx)
self._enforce_cohere_max_tokens(ctx)
#################################################################
# Long Text Embedding with Chunked Processing
# PTAL: examples/pooling/embed/openai_embedding_long_text
#################################################################
def pre_process_online(self, ctx: PoolingServeContext):
super().pre_process_online(ctx)
if not self.enable_chunked_processing:
return None
def _pre_process_chunked(self, ctx: PoolingServeContext) -> None:
if ctx.engine_prompts is None:
raise ValueError("Engine prompts not available")
@@ -61,18 +113,10 @@ class EmbedIOProcessor(PoolingIOProcessor):
ctx.engine_prompts = chunked_engine_prompts
ctx.prompt_request_ids = prompt_request_ids
return None
def post_process_online(
self,
ctx: PoolingServeContext,
):
if ctx.final_res_batch is None:
raise ValueError("Final response batch not available")
if not self.enable_chunked_processing:
return super().post_process_online(ctx)
def _post_process_chunked(self, ctx: PoolingServeContext) -> None:
# Online aggregation for chunked requests to
# minimize memory usage
# Track aggregation state for each prompt
@@ -195,4 +239,245 @@ class EmbedIOProcessor(PoolingIOProcessor):
raise ValueError(f"Result not found for prompt {prompt_idx}")
ctx.final_res_batch = final_res_batch
return None
#################################################################
# Cohere Request Preprocessing & Postprocessing
#################################################################
@staticmethod
def _load_task_instructions(hf_config: Any) -> dict[str, str] | None:
    """Extract ``task_instructions`` from the HF model config.

    Returns a dict mapping input_type -> prefix, or ``None`` when the
    config has no usable (string-valued) instructions. Normalizing
    "nothing usable" to ``None`` — including the case where every value
    was filtered out as non-string — keeps downstream ``is None`` checks
    (e.g. _validate_input_type) consistent; previously an all-non-string
    dict leaked through as ``{}``.
    """
    ti = getattr(hf_config, "task_instructions", None)
    if not isinstance(ti, dict):
        return None
    # Keep only string-valued entries; an empty result means "none".
    filtered = {k: v for k, v in ti.items() if isinstance(v, str)}
    return filtered or None
@staticmethod
def _load_st_prompts(
    model: str | Any,
    revision: str | None,
) -> dict[str, str] | None:
    """Load ``task_instructions`` from ``config_sentence_transformers.json``.

    Best-effort: any download/parse failure is treated as "no prompts".
    Returns ``None`` (never an empty dict) when nothing usable is found,
    mirroring _load_task_instructions so downstream ``is None`` checks
    behave consistently.
    """
    from vllm.transformers_utils.repo_utils import get_hf_file_to_dict

    try:
        cfg = get_hf_file_to_dict(
            "config_sentence_transformers.json", str(model), revision
        )
    except (ValueError, OSError):
        # Missing file / unreachable hub: silently fall back to "none".
        return None
    if cfg is None:
        return None
    prompts = cfg.get("prompts")
    if not isinstance(prompts, dict):
        return None
    # Keep only string-valued entries; an empty result means "none".
    filtered = {k: v for k, v in prompts.items() if isinstance(v, str)}
    return filtered or None
@staticmethod
def _mixed_input_to_messages(
    inp: CohereEmbedInput,
    *,
    task_prefix: str | None = None,
) -> list[ChatCompletionMessageParam]:
    """Build chat messages from a mixed text+image input.

    When *task_prefix* is given, it is prepended to each text part.
    Unknown part types (or parts missing their payload) are skipped.
    """
    content: list[ChatCompletionContentPartParam] = []
    for part in inp.content:
        if part.type == "text" and part.text is not None:
            body = part.text if not task_prefix else task_prefix + part.text
            content.append(
                ChatCompletionContentPartTextParam(type="text", text=body)
            )
        elif part.type == "image_url" and part.image_url is not None:
            image = ImageURL(url=part.image_url["url"])
            content.append(
                ChatCompletionContentPartImageParam(
                    type="image_url", image_url=image
                )
            )
    return [CustomChatCompletionMessageParam(role="user", content=content)]
@staticmethod
def _check_cohere_max_tokens(
    outputs: list[PoolingRequestOutput],
    max_tokens_check: int | None,
) -> None:
    """Raise if any output exceeds *max_tokens_check* tokens.

    Used to enforce ``truncate=NONE`` with an explicit ``max_tokens``:
    the pipeline runs without truncation and we reject afterwards.
    """
    if max_tokens_check is None:
        return
    for output in outputs:
        token_count = len(output.prompt_token_ids)
        if token_count <= max_tokens_check:
            continue
        raise ValueError(
            f"Input of {token_count} tokens exceeds "
            f"max_tokens={max_tokens_check} "
            "with truncate=NONE. Set truncate to END or START to "
            "allow truncation."
        )
@staticmethod
def _resolve_cohere_truncation(
    request: CohereEmbedRequest,
) -> tuple[int | None, Literal["left", "right"] | None]:
    """Return ``(truncate_prompt_tokens, truncation_side)``.

    - truncate == "NONE": no truncation at all.
    - truncate == "START": truncate from the left (keep the tail).
    - otherwise: default side; the limit is ``max_tokens`` when provided,
      else -1 (model max length).
    """
    if request.truncate == "NONE":
        return None, None
    limit = -1 if request.max_tokens is None else request.max_tokens
    if request.truncate == "START":
        return limit, "left"
    return limit, None
def create_pooling_params(self, request):
    """Cohere requests map ``output_dimension`` onto the embed task's
    ``dimensions``; every other request type defers to the base class."""
    if not isinstance(request, CohereEmbedRequest):
        return super().create_pooling_params(request)
    return PoolingParams(task="embed", dimensions=request.output_dimension)
def _pre_process_cohere_online(self, ctx: PoolingServeContext) -> None:
    """Convert a ``CohereEmbedRequest`` into engine prompts.

    For texts, a single batched completion request path is used.
    For images and mixed inputs, conversations are batch-rendered
    through the chat template in one ``render_chat`` call.

    Raises:
        ValueError: if none of texts/images/inputs is provided, or the
            request's input_type is not supported by this model.
    """
    request = ctx.request
    assert isinstance(request, CohereEmbedRequest)
    # At least one payload kind must be present.
    if request.texts is None and request.images is None and request.inputs is None:
        raise ValueError("One of texts, images, or inputs must be provided")

    # Map Cohere's truncate/max_tokens onto internal truncation params.
    truncate_prompt_tokens, truncation_side = self._resolve_cohere_truncation(
        request
    )
    input_type = request.input_type
    self._validate_input_type(input_type)

    if request.images is not None:
        # Image-only inputs: one single-image user message per URI.
        all_messages: list[list[ChatCompletionMessageParam]] = [
            [
                CustomChatCompletionMessageParam(
                    role="user",
                    content=[{"type": "image_url", "image_url": {"url": uri}}],
                )
            ]
            for uri in request.images
        ]
        ctx.engine_prompts = self._batch_render_chat(
            request, all_messages, truncate_prompt_tokens, truncation_side
        )
    elif request.inputs is not None:
        # Mixed text+image inputs: the task prefix applies to text parts only.
        task_prefix = self._get_task_instruction_prefix(input_type)
        all_messages = [
            self._mixed_input_to_messages(inp, task_prefix=task_prefix)
            for inp in request.inputs
        ]
        ctx.engine_prompts = self._batch_render_chat(
            request, all_messages, truncate_prompt_tokens, truncation_side
        )
    else:
        # Plain texts: prepend any input_type prefix, then reuse the
        # OpenAI completion-style preprocessing via a proxy request.
        prefixed = self._apply_task_instruction(request.texts or [], input_type)
        proxy = EmbeddingCompletionRequest(
            model=request.model,
            input=prefixed,
            dimensions=request.output_dimension,
            encoding_format="float",
            truncate_prompt_tokens=truncate_prompt_tokens,
            truncation_side=truncation_side,
        )
        ctx.engine_prompts = self._preprocess_completion_online(
            proxy, prompt_input=proxy.input, prompt_embeds=None
        )
def _batch_render_chat(
    self,
    request: CohereEmbedRequest,
    all_messages: Sequence[list[ChatCompletionMessageParam]],
    truncate_prompt_tokens: int | None,
    truncation_side: Literal["left", "right"] | None,
) -> list[ProcessorInputs]:
    """Batch-render multiple conversations through the chat template.

    Returns one engine prompt per conversation in *all_messages*;
    an empty batch short-circuits to an empty list.
    """
    if not all_messages:
        return []

    # Proxy chat request: only used to derive tokenize/chat parameters,
    # so the first conversation stands in for the whole batch.
    proxy = EmbeddingChatRequest(
        model=request.model,
        messages=list(all_messages[0]),
        dimensions=request.output_dimension,
        encoding_format="float",
        truncate_prompt_tokens=truncate_prompt_tokens,
        truncation_side=truncation_side,
    )
    renderer = self.renderer
    mm_config = self.model_config.multimodal_config
    tok_params = proxy.build_tok_params(self.model_config)
    chat_params = proxy.build_chat_params(
        self.chat_template,
        self.chat_template_content_format,
    ).with_defaults(
        merge_kwargs(
            None,
            dict(
                tools=None,
                # NOTE(review): tokenize is enabled only for Mistral
                # tokenizers — presumably they render+tokenize in one
                # pass; confirm against renderer.render_chat semantics.
                tokenize=is_mistral_tokenizer(renderer.tokenizer),
            ),
        ),
        default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
    )
    # All conversations are rendered in a single render_chat call.
    _, engine_prompts = renderer.render_chat(all_messages, chat_params, tok_params)
    return engine_prompts
def _validate_input_type(self, input_type: str | None) -> None:
    """Raise if *input_type* is not supported by this model."""
    if input_type is None:
        return
    if self.task_instructions is None:
        raise ValueError(
            f"Unsupported input_type {input_type!r}. "
            "This model does not define any input_type task instructions."
        )
    if input_type in self.task_instructions:
        return
    supported = ", ".join(sorted(self.task_instructions))
    raise ValueError(
        f"Unsupported input_type {input_type!r}. Supported values: {supported}"
    )
def _apply_task_instruction(
    self,
    texts: list[str],
    input_type: str | None,
) -> list[str]:
    """Prepend the task-instruction prefix for *input_type*.

    Returns *texts* unchanged (same object) when no matching prefix
    is configured.
    """
    prefix = self._get_task_instruction_prefix(input_type)
    return texts if not prefix else [prefix + text for text in texts]
def _get_task_instruction_prefix(self, input_type: str | None) -> str | None:
    """Return the task-instruction prefix for *input_type*, or ``None``.

    Empty-string prefixes are normalized to ``None``.
    """
    if input_type is None or not self.task_instructions:
        return None
    return self.task_instructions.get(input_type) or None
def _enforce_cohere_max_tokens(self, ctx: PoolingServeContext) -> None:
    """Apply the post-hoc token-count check for truncate=NONE requests
    that also specify max_tokens; all other requests are left alone."""
    request = ctx.request
    if not isinstance(request, CohereEmbedRequest):
        return
    if request.truncate == "NONE" and request.max_tokens is not None:
        self._check_cohere_max_tokens(ctx.final_res_batch, request.max_tokens)

View File

@@ -1,9 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import TypeAlias
"""Embedding API protocol models for OpenAI and Cohere formats.
from pydantic import Field
OpenAI: https://platform.openai.com/docs/api-reference/embeddings
Cohere: https://docs.cohere.com/reference/embed
"""
import base64
import builtins
import struct
import time
from collections.abc import Sequence
from typing import Literal, TypeAlias
from pydantic import BaseModel, Field
from vllm import PoolingParams
from vllm.config import ModelConfig
@@ -17,6 +27,10 @@ from vllm.entrypoints.pooling.base.protocol import (
from vllm.renderers import TokenizeParams
from vllm.utils import random_uuid
# ---------------------------------------------------------------------------
# OpenAI /v1/embeddings — request models
# ---------------------------------------------------------------------------
def _get_max_total_output_tokens(
model_config: ModelConfig,
@@ -50,6 +64,7 @@ class EmbeddingCompletionRequest(
max_total_tokens=max_total_tokens,
max_output_tokens=max_output_tokens,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=self.add_special_tokens,
max_total_tokens_param="max_model_len",
@@ -79,6 +94,7 @@ class EmbeddingChatRequest(
max_total_tokens=max_total_tokens,
max_output_tokens=max_output_tokens,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=self.add_special_tokens,
max_total_tokens_param="max_model_len",
@@ -96,6 +112,11 @@ class EmbeddingChatRequest(
EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest
# ---------------------------------------------------------------------------
# OpenAI /v1/embeddings — response models
# ---------------------------------------------------------------------------
class EmbeddingResponseData(OpenAIBaseModel):
index: int
object: str = "embedding"
@@ -106,7 +127,7 @@ class EmbeddingResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
object: str = "list"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
model: str | None = None
data: list[EmbeddingResponseData]
usage: UsageInfo
@@ -115,3 +136,146 @@ class EmbeddingBytesResponse(OpenAIBaseModel):
content: list[bytes]
headers: dict[str, str] | None = None
media_type: str = "application/octet-stream"
# ---------------------------------------------------------------------------
# Cohere /v2/embed — request models
# ---------------------------------------------------------------------------
# Embedding output formats accepted by Cohere's /v2/embed `embedding_types`.
CohereEmbeddingType = Literal[
    "float",
    "binary",
    "ubinary",
    "base64",
]
# Cohere truncation strategy; "NONE" disables truncation entirely.
CohereTruncate = Literal["NONE", "START", "END"]
class CohereEmbedContent(BaseModel):
    """One content part inside a structured Cohere embed input."""

    # Discriminator for which payload field below is populated.
    type: Literal["text", "image_url"]
    text: str | None = None
    # NOTE(review): exact key shape of the mapping (e.g. {"url": ...}) is not
    # visible here — confirm against the request handler.
    image_url: dict[str, str] | None = None
class CohereEmbedInput(BaseModel):
    # A single multimodal input: an ordered list of content parts.
    content: list[CohereEmbedContent]
class CohereEmbedRequest(BaseModel):
    """Request body for the Cohere-compatible ``/v2/embed`` endpoint."""

    # Optional model name; may be omitted by the client.
    model: str | None = None
    # Task hint (e.g. "search_query") used to select a task-instruction prefix.
    input_type: str | None = None
    texts: list[str] | None = None
    images: list[str] | None = None
    # Structured multimodal inputs, an alternative to `texts`/`images`.
    inputs: list[CohereEmbedInput] | None = None
    output_dimension: int | None = None
    # Requested output formats; treated as ["float"] downstream when None.
    embedding_types: list[CohereEmbeddingType] | None = None
    truncate: CohereTruncate = "END"
    # Token limit; enforced only when truncate == "NONE" (see serving code).
    max_tokens: int | None = None
    priority: int = 0
# ---------------------------------------------------------------------------
# Cohere /v2/embed — response models
# ---------------------------------------------------------------------------
class CohereApiVersion(BaseModel):
    # API version echoed in response metadata; this endpoint implements v2.
    version: str = "2"
class CohereBilledUnits(BaseModel):
    # Token counts reported for billing; either field may be omitted.
    input_tokens: int | None = None
    image_tokens: int | None = None
class CohereMeta(BaseModel):
    # Response metadata: API version plus optional billing information.
    api_version: CohereApiVersion = Field(default_factory=CohereApiVersion)
    billed_units: CohereBilledUnits | None = None
class CohereEmbedByTypeEmbeddings(BaseModel):
    """Embeddings keyed by requested type (Cohere ``embeddings_by_type``)."""

    # The field name ``float`` shadows the builtin type, so the annotation
    # must use ``builtins.float`` to avoid a self-referential type error.
    float: list[list[builtins.float]] | None = None
    # Sign-bit-packed embeddings as signed int8 values (_pack_binary_embeddings).
    binary: list[list[int]] | None = None
    # Same bit-packing, as unsigned uint8 values.
    ubinary: list[list[int]] | None = None
    # Little-endian float32 buffers, base64-encoded (_encode_base64_embeddings).
    base64: list[str] | None = None
class CohereEmbedResponse(BaseModel):
    """Response body for the Cohere-compatible ``/v2/embed`` endpoint."""

    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    embeddings: CohereEmbedByTypeEmbeddings
    # Echo of the request's `texts`, when provided.
    texts: list[str] | None = None
    meta: CohereMeta | None = None
    response_type: Literal["embeddings_by_type"] = "embeddings_by_type"
# ---------------------------------------------------------------------------
# Cohere embedding type conversion helpers
# ---------------------------------------------------------------------------
_UNSIGNED_TO_SIGNED_DIFF = 1 << 7 # 128
def _pack_binary_embeddings(
float_embeddings: list[list[float]],
signed: bool,
) -> list[list[int]]:
"""Bit-pack float embeddings: positive -> 1, negative -> 0.
Each bit is shifted left by ``7 - idx%8``, and every 8 bits are packed
into one byte.
"""
result: list[list[int]] = []
for embedding in float_embeddings:
dim = len(embedding)
if dim % 8 != 0:
raise ValueError(
"Embedding dimension must be a multiple of 8 for binary "
f"embedding types, but got {dim}."
)
packed_len = dim // 8
packed: list[int] = []
byte_val = 0
for idx, value in enumerate(embedding):
bit = 1 if value >= 0 else 0
byte_val += bit << (7 - idx % 8)
if (idx + 1) % 8 == 0:
if signed:
byte_val -= _UNSIGNED_TO_SIGNED_DIFF
packed.append(byte_val)
byte_val = 0
assert len(packed) == packed_len
result.append(packed)
return result
def _encode_base64_embeddings(
float_embeddings: list[list[float]],
) -> list[str]:
"""Encode float embeddings as base64 (little-endian float32)."""
result: list[str] = []
for embedding in float_embeddings:
buf = struct.pack(f"<{len(embedding)}f", *embedding)
result.append(base64.b64encode(buf).decode("utf-8"))
return result
def build_typed_embeddings(
    float_embeddings: list[list[float]],
    embedding_types: Sequence[str],
) -> CohereEmbedByTypeEmbeddings:
    """Convert float embeddings to every requested Cohere embedding type.

    Entries of *embedding_types* not in the known set are silently ignored
    (the request model's ``Literal`` validation rules them out upstream).
    """
    converters = {
        "float": lambda: float_embeddings,
        "binary": lambda: _pack_binary_embeddings(float_embeddings, signed=True),
        "ubinary": lambda: _pack_binary_embeddings(float_embeddings, signed=False),
        "base64": lambda: _encode_base64_embeddings(float_embeddings),
    }
    result = CohereEmbedByTypeEmbeddings()
    for emb_type in embedding_types:
        convert = converters.get(emb_type)
        if convert is not None:
            setattr(result, emb_type, convert())
    return result

View File

@@ -5,7 +5,7 @@ from collections.abc import Callable
from functools import partial
from typing import Literal, TypeAlias, cast
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.responses import JSONResponse, Response, StreamingResponse
from typing_extensions import assert_never
from vllm.config import ModelConfig
@@ -14,10 +14,15 @@ from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.serving import PoolingServing
from vllm.entrypoints.pooling.embed.io_processor import EmbedIOProcessor
from vllm.entrypoints.pooling.embed.protocol import (
CohereBilledUnits,
CohereEmbedRequest,
CohereEmbedResponse,
CohereMeta,
EmbeddingBytesResponse,
EmbeddingRequest,
EmbeddingResponse,
EmbeddingResponseData,
build_typed_embeddings,
)
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.entrypoints.pooling.utils import (
@@ -26,24 +31,23 @@ from vllm.entrypoints.pooling.utils import (
encode_pooling_output_float,
get_json_response_cls,
)
from vllm.logger import init_logger
from vllm.outputs import PoolingRequestOutput
from vllm.renderers import BaseRenderer
from vllm.utils.serial_utils import EmbedDType, Endianness
logger = init_logger(__name__)
JSONResponseCLS = get_json_response_cls()
EmbeddingServeContext: TypeAlias = PoolingServeContext[EmbeddingRequest]
class ServingEmbedding(PoolingServing):
"""
Embedding API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/embeddings/create
for the API specification. This API mimics the OpenAI Embedding API.
"""
"""Embedding API supporting both OpenAI and Cohere formats."""
request_id_prefix = "embd"
io_processor: EmbedIOProcessor
def init_io_processor(
self,
@@ -58,6 +62,14 @@ class ServingEmbedding(PoolingServing):
)
async def _build_response(
    self,
    ctx: PoolingServeContext,
) -> Response:
    """Dispatch on request flavor: Cohere requests get the Cohere response
    shape, everything else goes through the OpenAI-format builder."""
    if isinstance(ctx.request, CohereEmbedRequest):
        return self._build_cohere_response_from_ctx(ctx)
    return await self._build_openai_response(ctx)
async def _build_openai_response(
self,
ctx: EmbeddingServeContext,
) -> JSONResponse | StreamingResponse:
@@ -66,7 +78,7 @@ class ServingEmbedding(PoolingServing):
endianness = ctx.request.endianness
if encoding_format == "float" or encoding_format == "base64":
return self._request_output_to_embed_json_response(
return self._openai_json_response(
ctx.final_res_batch,
ctx.request_id,
ctx.created_time,
@@ -77,7 +89,7 @@ class ServingEmbedding(PoolingServing):
)
if encoding_format == "bytes" or encoding_format == "bytes_only":
return self._request_output_to_to_embed_bytes_response(
return self._openai_bytes_response(
ctx.final_res_batch,
ctx.request_id,
ctx.created_time,
@@ -89,7 +101,7 @@ class ServingEmbedding(PoolingServing):
assert_never(encoding_format)
def _request_output_to_embed_json_response(
def _openai_json_response(
self,
final_res_batch: list[PoolingRequestOutput],
request_id: str,
@@ -139,7 +151,7 @@ class ServingEmbedding(PoolingServing):
)
return JSONResponseCLS(content=response.model_dump())
def _request_output_to_to_embed_bytes_response(
def _openai_bytes_response(
self,
final_res_batch: list[PoolingRequestOutput],
request_id: str,
@@ -177,3 +189,33 @@ class ServingEmbedding(PoolingServing):
headers=response.headers,
media_type=response.media_type,
)
@staticmethod
def _build_cohere_response_from_ctx(
    ctx: PoolingServeContext,
) -> JSONResponse:
    """Convert pooled outputs into a Cohere ``/v2/embed`` JSON response.

    Builds every requested embedding type from the float pooling outputs
    and attaches billing metadata derived from prompt token counts.
    """
    request = ctx.request
    assert isinstance(request, CohereEmbedRequest)
    # Decode each pooled output to a plain float vector first; the typed
    # variants (binary/ubinary/base64) are derived from these.
    all_floats = [encode_pooling_output_float(out) for out in ctx.final_res_batch]
    total_tokens = sum(len(out.prompt_token_ids) for out in ctx.final_res_batch)
    # NOTE(review): when `images` is present, ALL prompt tokens are billed as
    # image tokens and input_tokens becomes 0 — assumes image-only requests;
    # confirm behavior for mixed text+image `inputs`.
    image_tokens = total_tokens if request.images is not None else 0
    texts_echo = request.texts
    # Cohere defaults to float embeddings when none were requested.
    embedding_types = request.embedding_types or ["float"]
    embeddings_obj = build_typed_embeddings(all_floats, embedding_types)
    input_tokens = total_tokens - image_tokens
    response = CohereEmbedResponse(
        id=ctx.request_id,
        embeddings=embeddings_obj,
        texts=texts_echo,
        meta=CohereMeta(
            billed_units=CohereBilledUnits(
                input_tokens=input_tokens,
                image_tokens=image_tokens,
            ),
        ),
    )
    # exclude_none keeps unrequested embedding types out of the payload.
    return JSONResponse(content=response.model_dump(exclude_none=True))

View File

@@ -36,6 +36,7 @@ class PoolingCompletionRequest(
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=self.add_special_tokens,
max_total_tokens_param="max_model_len",
@@ -61,6 +62,7 @@ class PoolingChatRequest(
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=self.add_special_tokens,
max_total_tokens_param="max_model_len",
@@ -88,6 +90,7 @@ class IOProcessorRequest(PoolingBasicRequestMixin, EncodingRequestMixin, Generic
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
add_special_tokens=not model_config.is_encoder_decoder,
max_total_tokens_param="max_model_len",

View File

@@ -30,6 +30,7 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
max_total_tokens_param="max_model_len",
)
@@ -105,6 +106,7 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
max_total_tokens=model_config.max_model_len,
max_output_tokens=0,
truncate_prompt_tokens=self.truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=encoder_config.get("do_lower_case", False),
max_total_tokens_param="max_model_len",
)

View File

@@ -15,6 +15,7 @@ from vllm.entrypoints.pooling.classify.protocol import (
ClassificationResponse,
)
from vllm.entrypoints.pooling.embed.protocol import (
CohereEmbedRequest,
EmbeddingBytesResponse,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
@@ -50,6 +51,7 @@ AnyPoolingRequest: TypeAlias = (
| IOProcessorRequest
| RerankRequest
| ScoreRequest
| CohereEmbedRequest
)
AnyPoolingResponse: TypeAlias = (

View File

@@ -659,6 +659,13 @@ def run_cutlass_moe_fp4(
class CutlassExpertsFp4(mk.FusedMoEExpertsModular):
"""CUTLASS FP4 fused MoE expert implementation."""
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Fuse activation scales into w_scale_2 in-place so that
    # g1/g2_alphas (which reference the same tensor) stay in sync
    # when EPLB rearranges the parameter.
    # NOTE(review): in-place mul_ is not idempotent — assumes this hook
    # runs exactly once after weight loading.
    layer.w13_weight_scale_2.data.mul_(layer.w13_input_scale)
    layer.w2_weight_scale_2.data.mul_(layer.w2_input_scale)
@property
def expects_unquantized_inputs(self) -> bool:
return True

View File

@@ -56,10 +56,25 @@ class TrtLlmNvFp4ExpertsBase:
# g1_scale_c = a13_scale * w13_scale_2 / a2_scale
self.g1_scale_c = self.quant_config.g1_alphas * self.quant_config.a2_gscale
else:
self.g1_scale_c = (
torch.ones_like(self.quant_config.a1_gscale)
* self.quant_config.a2_gscale
)
self.g1_scale_c = self.quant_config.a2_gscale.clone()
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Fuse the per-tensor activation scales into the weight scale-2 tensors
    # in-place; quant_config.g1_alphas / g2_alphas alias these same tensors.
    layer.w13_weight_scale_2.data.mul_(layer.w13_input_scale)
    layer.w2_weight_scale_2.data.mul_(layer.w2_input_scale)
    # Recompute g1_scale_c since g1_alphas was just fused in-place.
    # Register as a layer parameter so EPLB rearranges it alongside
    # other expert weights.
    assert self.quant_config.g1_alphas is not None
    assert self.quant_config.a2_gscale is not None
    if self.moe_config.is_act_and_mul:
        g1_scale_c = self.quant_config.g1_alphas * self.quant_config.a2_gscale
    else:
        # Non-gated activation: no w13 scale contribution, only a2_gscale.
        g1_scale_c = self.quant_config.a2_gscale.clone()
    layer.register_parameter(
        "g1_scale_c",
        torch.nn.Parameter(g1_scale_c, requires_grad=False),
    )
    # Keep the local reference pointing at the registered parameter so both
    # views stay consistent after EPLB rearrangement.
    self.g1_scale_c = layer.g1_scale_c
@staticmethod
def _supports_current_device() -> bool:

View File

@@ -49,6 +49,10 @@ class FlashInferCuteDSLExperts(mk.FusedMoEExpertsModular):
)
self.out_dtype = moe_config.in_dtype
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Fuse activation scales into the weight scale-2 tensors in-place so
    # any quant-config aliases of these tensors stay in sync under EPLB.
    layer.w13_weight_scale_2.data.mul_(layer.w13_input_scale)
    layer.w2_weight_scale_2.data.mul_(layer.w2_input_scale)
@staticmethod
def activation_format() -> mk.FusedMoEActivationFormat:
return mk.FusedMoEActivationFormat.BatchedExperts

View File

@@ -61,6 +61,11 @@ def is_valid_flashinfer_cutlass_fused_moe(
class FlashInferExperts(mk.FusedMoEExpertsModular):
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Only the NVFP4 W4A4 path fuses activation scales into the weight
    # scale-2 tensors; other quant modes need no post-load processing here.
    if self.quant_config.use_nvfp4_w4a4:
        layer.w13_weight_scale_2.data.mul_(layer.w13_input_scale)
        layer.w2_weight_scale_2.data.mul_(layer.w2_input_scale)
def __init__(
self,
moe_config: mk.FusedMoEConfig,

View File

@@ -1421,19 +1421,23 @@ class FusedMoE(CustomOp):
weights = list(self.named_parameters())
weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights]
# `w13_input_scale` and `w2_input_scale` are global per-tensor
# activation scales shared across all experts (e.g. NVFP4).
# They are broadcast views (stride 0) from .expand() and are
# not actual expert weights, so exclude them from EPLB.
NON_EXPERT_WEIGHTS = {
"e_score_correction_bias",
"w13_input_scale",
"w2_input_scale",
}
assert all(
weight.is_contiguous()
for name, weight in weights
if not (name.startswith("_shared_experts.") or name.startswith("_gate."))
and name not in NON_EXPERT_WEIGHTS
)
# Filter out the non-expert weights.
# `e_score_correction_bias` is a bias for each logical expert,
# with shape (num_logical_experts,), not an expert weight.
NON_EXPERT_WEIGHTS = {
"e_score_correction_bias",
}
return [
weight.view(self.local_num_experts, -1)
for name, weight in weights

View File

@@ -489,6 +489,9 @@ class FusedMoEExperts(ABC):
self.max_num_tokens = max_num_tokens
self.num_dispatchers = num_dispatchers
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:  # noqa: B027
    # Default no-op post-load hook; subclasses (e.g. NVFP4 experts)
    # override this to fuse activation scales into weights after loading.
    pass
@staticmethod
def is_monolithic() -> bool:
raise NotImplementedError("Implemented by subclasses.")

View File

@@ -374,11 +374,13 @@ def make_nvfp4_moe_quant_config(
w2_scale=w2_scale,
)
g1_alphas = a13_scale * w13_scale_2
g2_alphas = a2_scale * w2_scale_2
# Pass w13_scale_2 / w2_scale_2 directly as g1/g2_alphas.
# The expert's process_weights_after_loading will fuse activation
# scales in-place. Since the quant config references the same tensor
# as the registered parameter, EPLB rearrangement stays in sync.
return nvfp4_moe_quant_config(
g1_alphas=g1_alphas,
g2_alphas=g2_alphas,
g1_alphas=w13_scale_2,
g2_alphas=w2_scale_2,
a1_gscale=(1.0 / a13_scale),
a2_gscale=(1.0 / a2_scale),
w1_scale=w13_scale,

View File

@@ -570,6 +570,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
shared_experts=layer.shared_experts,
routing_tables=layer._maybe_init_expert_routing_tables(),
)
self.moe_kernel.fused_experts.process_weights_after_loading(layer)
def maybe_make_prepare_finalize(
self,

View File

@@ -1394,6 +1394,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
shared_experts=layer.shared_experts,
routing_tables=layer._maybe_init_expert_routing_tables(),
)
self.moe_kernel.fused_experts.process_weights_after_loading(layer)
def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
return make_nvfp4_moe_quant_config(

View File

@@ -267,16 +267,6 @@ def prepare_nvfp4_moe_layer_for_fi_or_cutlass(
num_experts=w13.size(0),
is_gated_activation=is_gated,
)
# We do not need to make this a parameter, because
# it is not used during the weight (re)-loading process.
if is_gated:
layer.g1_scale_c = a13_scale * w13_scale_2 / a2_scale
else:
layer.g1_scale_c = torch.ones_like(a13_scale) / a2_scale
layer.a1_gscale = 1.0 / a13_scale
layer.g1_alphas = a13_scale * w13_scale_2
layer.g2_alphas = a2_scale * w2_scale_2
else:
# Swizzle the block scales for other FI NVFP4 MoE kernels.
w13_scale = swizzle_blockscale(w13_scale)

View File

@@ -313,7 +313,18 @@ class DefaultModelLoader(BaseModelLoader):
vllm_config = get_current_vllm_config()
parallel_config = vllm_config.parallel_config
if not (model_config.is_moe and parallel_config.enable_expert_parallel):
if not (
model_config.is_moe
and parallel_config.enable_expert_parallel
and parallel_config.enable_ep_weight_filter
):
return
# When EPLB is enabled, redundant physical expert slots may map to
# logical experts that belong to other ranks in the default partition.
# The weight loader needs to see ALL logical expert weights so it can
# populate these redundant slots. Skip the filter entirely.
if parallel_config.enable_eplb:
return
num_experts = model_config.get_num_experts()

View File

@@ -73,4 +73,9 @@ def should_skip_weight(
if eid is None:
# Not an expert weight (dense / shared-expert / embedding) → keep.
return False
# Only skip heavy weight tensors, never scale/metadata tensors.
# Scale tensors are tiny and some backends need them from ALL experts
# (e.g. FlashInfer NVFP4 computes a global max of activation scales).
if not weight_name.endswith(".weight"):
return False
return eid not in local_expert_ids

View File

@@ -246,6 +246,7 @@ class CpuPlatform(Platform):
"size_asserts": False,
"nan_asserts": False,
"epilogue_fusion": True,
"cpp.dynamic_threads": True,
}
)

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, TypeVar
from typing import TYPE_CHECKING, Any, Literal, TypeVar
from vllm.exceptions import VLLMValidationError
from vllm.inputs import EmbedsPrompt, TextPrompt, TokensPrompt
@@ -153,6 +153,14 @@ class TokenizeParams:
- `-1` maps to `max_input_tokens`.
"""
truncation_side: Literal["left", "right"] | None = None
"""
Which side to truncate from when ``truncate_prompt_tokens`` is active:
- ``"right"`` keeps the first N tokens (truncate from the end).
- ``"left"`` keeps the last N tokens (truncate from the start).
- ``None`` falls back to the tokenizer default.
"""
do_lower_case: bool = False
"""Whether to normalize text to lower case before tokenization."""
@@ -271,6 +279,7 @@ class TokenizeParams:
),
pad_prompt_tokens=pad_prompt_tokens,
truncate_prompt_tokens=truncate_prompt_tokens,
truncation_side=self.truncation_side,
do_lower_case=do_lower_case,
add_special_tokens=add_special_tokens,
needs_detokenization=needs_detokenization,
@@ -286,6 +295,16 @@ class TokenizeParams:
# while still failing `self._token_len_check` as expected by users
max_length = self.max_input_tokens + 1
# Left-side truncation requires the full token sequence so we can
# slice from the end in _token_truncation. Disable HF-level
# truncation (which would incorrectly truncate from the right for
# pooling models) and let _token_truncation handle it.
if self.truncation_side == "left":
return dict(
truncation=False,
add_special_tokens=self.add_special_tokens,
)
return dict(
truncation=max_length is not None,
max_length=max_length,
@@ -375,7 +394,10 @@ class TokenizeParams:
if max_length == 0:
return tokens[:0]
if getattr(tokenizer, "truncation_side", "left") == "left":
side = self.truncation_side or (
tokenizer.truncation_side if tokenizer is not None else None
)
if side == "left":
return tokens[-max_length:]
return tokens[:max_length]

View File

@@ -162,6 +162,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
# Share workspace buffer across all executions
self._workspace = g_sm100_workspace
# Pre-allocated output buffer, lazily sized on first call.
# Zero-init once to prevent NaN in padding slots (seq_lens=0)
# from contaminating downstream per-tensor reductions.
self._decode_out: torch.Tensor | None = None
def _sm100_cutlass_mla_decode(
self,
q_nope: torch.Tensor,
@@ -218,7 +223,15 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
if is_quantized_kv_cache(self.kv_cache_dtype)
else q_nope.dtype
)
out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype)
# Reuse pre-allocated zero-init output buffer to avoid a memset
# kernel on every CUDA graph replay.
if (
self._decode_out is None
or self._decode_out.shape[0] < B_q
or self._decode_out.dtype != dtype
):
self._decode_out = q_nope.new_zeros((B_q, MAX_HEADS, D_latent), dtype=dtype)
out = self._decode_out[:B_q]
lse = (
torch.empty((B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device)
if self.need_to_return_lse_for_decode

View File

@@ -21,6 +21,7 @@ from vllm.v1.attention.backend import (
AttentionLayer,
AttentionType,
MultipleOf,
is_quantized_kv_cache,
)
from vllm.v1.attention.backends.utils import KVCacheLayoutType
@@ -151,6 +152,11 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
self.bmm1_scale: float | None = None
self.bmm2_scale: float | None = None
# Pre-allocated output buffer, lazily sized on first call.
# Zero-init once to prevent NaN in padding slots (seq_lens=0)
# from contaminating downstream per-tensor reductions.
self._decode_out: torch.Tensor | None = None
def forward_mqa(
self,
q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
@@ -181,6 +187,37 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
if self.bmm2_scale is None:
self.bmm2_scale = layer._v_scale_float
# Reuse pre-allocated zero-init output buffer to avoid a memset
# kernel on every CUDA graph replay.
# q is 4D: (batch, q_len_per_req, num_heads, head_dim)
# FlashInfer has a bug where out= validation hardcodes 3D shape
# (batch, num_heads, kv_lora_rank), but the kernel writes 4D
# (batch, q_len, num_heads, kv_lora_rank) when q_len > 1.
# So we can only pass out= for single-token decode (q_len == 1).
# For q_len > 1, we zero padding slots after the kernel returns.
# TODO: upstream fix to FlashInfer
B, q_len_per_req = q.shape[0], q.shape[1]
out_kwargs: dict[str, torch.Tensor] = {}
if q_len_per_req == 1:
dtype = (
torch.bfloat16
if is_quantized_kv_cache(self.kv_cache_dtype)
else q.dtype
)
if (
self._decode_out is None
or self._decode_out.shape[0] < B
or self._decode_out.dtype != dtype
):
self._decode_out = torch.zeros(
B,
q.shape[2],
self.kv_lora_rank,
dtype=dtype,
device=q.device,
)
out_kwargs["out"] = self._decode_out[:B]
o = trtllm_batch_decode_with_kv_cache_mla(
query=q,
kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
@@ -193,8 +230,15 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
max_seq_len=attn_metadata.max_seq_len,
bmm1_scale=self.bmm1_scale,
bmm2_scale=self.bmm2_scale,
**out_kwargs,
)
# For q_len > 1, we can't pass out= so we work around by zeroing padding slots
if not out_kwargs:
num_real = attn_metadata.num_decodes
if num_real < o.shape[0]:
o[num_real:] = 0
# Flatten the output for consistent shape
o = o.view(-1, o.shape[-2], o.shape[-1])

View File

@@ -392,8 +392,10 @@ class Worker(WorkerBase):
)
# Profile CUDA graph memory if graphs will be captured.
# Skip on ROCm/HIP as graph pool handles and mem_get_info behave
# differently and can produce incorrect/negative estimates.
cudagraph_memory_estimate = 0
if not self.model_config.enforce_eager:
if not self.model_config.enforce_eager and not current_platform.is_rocm():
cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
# Use the pre-cudagraph torch peak to avoid double-counting.
@@ -406,6 +408,8 @@ class Worker(WorkerBase):
+ profile_result.weights_memory
)
# On ROCm, cudagraph_memory_estimate is always 0 so this is a no-op.
# On CUDA, respect the opt-in flag as originally designed.
cudagraph_memory_estimate_applied = (
cudagraph_memory_estimate
if envs.VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
@@ -517,7 +521,6 @@ class Worker(WorkerBase):
def update_max_model_len(self, max_model_len: int) -> None:
"""Update max_model_len after auto-fit to GPU memory.
This is called when max_model_len=-1 is used and the engine
automatically determines the maximum context length that fits
in GPU memory. Workers need to update their cached max_model_len