[Refactor] Clean up pooling serial utils (#33665)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-02-03 18:29:18 +08:00
committed by GitHub
parent dad2d6a590
commit 83449a5ff0
9 changed files with 417 additions and 332 deletions

View File

@@ -12,13 +12,12 @@ import json
import requests
import torch
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
from vllm.entrypoints.pooling.utils import (
MetadataItem,
build_metadata_items,
decode_pooling_output,
)
from vllm.utils.serial_utils import EMBED_DTYPES, ENDIANNESS
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
@@ -51,7 +50,7 @@ def main(args):
# The OpenAI client does not support the bytes encoding_format.
# The OpenAI client does not support the embed_dtype and endianness parameters.
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for embed_dtype in EMBED_DTYPES:
for endianness in ENDIANNESS:
prompt = {
"model": model,
@@ -74,7 +73,7 @@ def main(args):
# The vLLM server always returns the embeddings in the same order as the inputs,
# so returning metadata is not necessary. Set encoding_format to bytes_only to
# make the server omit the metadata.
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for embed_dtype in EMBED_DTYPES:
for endianness in ENDIANNESS:
prompt = {
"model": model,