[Frontend] Allow binary embedding responses to omit metadata by setting encoding_format to bytes_only. (#30249)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
wang.yuqi
2025-12-08 20:01:21 +08:00
committed by GitHub
parent 408cf42f67
commit 2e660c2434
10 changed files with 230 additions and 41 deletions

View File

@@ -2,15 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import math
import sys
from dataclasses import dataclass
from typing import Literal
from typing import TYPE_CHECKING, Any, Literal
import numpy as np
import torch
from typing_extensions import assert_never
from vllm import PoolingRequestOutput
if TYPE_CHECKING:
from vllm import PoolingRequestOutput
else:
PoolingRequestOutput = Any
sys_byteorder = sys.byteorder
@@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = {
"fp8_e5m2": torch.float8_e5m2,
}
# Bytes per element for each supported embedding dtype. Used to compute
# byte offsets/lengths when packing embeddings into a binary payload
# (see build_metadata_items). Must stay in sync with the dtype keys of
# EMBED_DTYPE_TO_TORCH_DTYPE.
EMBED_DTYPE_TO_N_BYTES = {
    "float32": 4,
    "float16": 2,
    "bfloat16": 2,
    "fp8_e4m3": 1,
    "fp8_e5m2": 1,
}
EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
"float32": torch.float32,
@@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"]
EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
Endianness = Literal["native", "big", "little"]
EncodingFormat = Literal["float", "base64", "bytes"]
EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"]
def tensor2base64(x: torch.Tensor) -> str:
@@ -114,7 +126,7 @@ def encode_pooling_output(
elif encoding_format == "base64":
embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness)
return base64.b64encode(embedding_bytes).decode("utf-8")
elif encoding_format == "bytes":
elif encoding_format == "bytes" or encoding_format == "bytes_only":
return tensor2binary(output.outputs.data, embed_dtype, endianness)
assert_never(encoding_format)
@@ -129,6 +141,29 @@ class MetadataItem:
shape: tuple[int, ...]
def build_metadata_items(
    embed_dtype: EmbedDType,
    endianness: Endianness,
    shape: tuple[int, ...],
    n_request: int,
):
    """Describe how ``n_request`` equally-shaped embeddings are laid out
    back-to-back in a single binary payload.

    Every embedding occupies ``prod(shape) * bytes_per_element`` bytes;
    item ``i`` covers the half-open byte range ``[start, end)`` of the
    concatenated buffer.

    Args:
        embed_dtype: Embedding element dtype (key of EMBED_DTYPE_TO_N_BYTES).
        endianness: Byte order the payload was serialized with.
        shape: Per-request embedding shape (identical for all requests).
        n_request: Number of embeddings packed into the payload.

    Returns:
        A list of ``n_request`` MetadataItem entries, ordered by index.
    """
    # Hoist the per-embedding byte stride out of the loop: it is the
    # same for every request.
    stride = math.prod(shape) * EMBED_DTYPE_TO_N_BYTES[embed_dtype]
    metadata: list[MetadataItem] = []
    for idx in range(n_request):
        offset = idx * stride
        metadata.append(
            MetadataItem(
                index=idx,
                embed_dtype=embed_dtype,
                endianness=endianness,
                start=offset,
                end=offset + stride,
                shape=shape,
            )
        )
    return metadata
def encode_pooling_bytes(
pooling_outputs: list[PoolingRequestOutput],
embed_dtype: EmbedDType,