[Frontend] Allow binary embedding responses to omit metadata by setting encoding_format to bytes_only. (#30249)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
wang.yuqi
2025-12-08 20:01:21 +08:00
committed by GitHub
parent 408cf42f67
commit 2e660c2434
10 changed files with 230 additions and 41 deletions

View File

@@ -2,15 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import math
import sys
from dataclasses import dataclass
from typing import Literal
from typing import TYPE_CHECKING, Any, Literal
import numpy as np
import torch
from typing_extensions import assert_never
from vllm import PoolingRequestOutput
if TYPE_CHECKING:
from vllm import PoolingRequestOutput
else:
PoolingRequestOutput = Any
sys_byteorder = sys.byteorder
@@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = {
"fp8_e5m2": torch.float8_e5m2,
}
# Bytes per element for each supported embedding dtype. Used to compute
# byte offsets/lengths when packing embeddings into a binary payload
# (see build_metadata_items). Must stay in sync with the dtype keys of
# EMBED_DTYPE_TO_TORCH_DTYPE.
EMBED_DTYPE_TO_N_BYTES = {
    "float32": 4,
    "float16": 2,
    "bfloat16": 2,
    "fp8_e4m3": 1,
    "fp8_e5m2": 1,
}
EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
"float32": torch.float32,
@@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"]
EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
Endianness = Literal["native", "big", "little"]
EncodingFormat = Literal["float", "base64", "bytes"]
EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"]
def tensor2base64(x: torch.Tensor) -> str:
@@ -114,7 +126,7 @@ def encode_pooling_output(
elif encoding_format == "base64":
embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness)
return base64.b64encode(embedding_bytes).decode("utf-8")
elif encoding_format == "bytes":
elif encoding_format == "bytes" or encoding_format == "bytes_only":
return tensor2binary(output.outputs.data, embed_dtype, endianness)
assert_never(encoding_format)
@@ -129,6 +141,29 @@ class MetadataItem:
shape: tuple[int, ...]
def build_metadata_items(
    embed_dtype: EmbedDType,
    endianness: Endianness,
    shape: tuple[int, ...],
    n_request: int,
):
    """Describe how ``n_request`` equally-shaped embeddings are laid out
    back-to-back in a single binary payload.

    Every embedding occupies ``prod(shape) * bytes_per_element`` bytes;
    item ``i`` covers the half-open byte range ``[start, end)`` of the
    concatenated buffer.

    Args:
        embed_dtype: Embedding element dtype (key of EMBED_DTYPE_TO_N_BYTES).
        endianness: Byte order the payload was serialized with.
        shape: Per-request embedding shape (identical for all requests).
        n_request: Number of embeddings packed into the payload.

    Returns:
        A list of ``n_request`` MetadataItem entries, ordered by index.
    """
    # Hoist the per-embedding byte stride out of the loop: it is the
    # same for every request.
    stride = math.prod(shape) * EMBED_DTYPE_TO_N_BYTES[embed_dtype]
    metadata: list[MetadataItem] = []
    for idx in range(n_request):
        offset = idx * stride
        metadata.append(
            MetadataItem(
                index=idx,
                embed_dtype=embed_dtype,
                endianness=endianness,
                start=offset,
                end=offset + stride,
                shape=shape,
            )
        )
    return metadata
def encode_pooling_bytes(
pooling_outputs: list[PoolingRequestOutput],
embed_dtype: EmbedDType,