# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import dataclasses
import importlib
import pickle
from abc import ABC, abstractmethod
from collections.abc import Callable, Sequence
from functools import partial
from inspect import isclass
from types import FunctionType
from typing import Any, ClassVar, TypeAlias, cast, get_type_hints

import cloudpickle
import msgspec
import numpy as np
import torch
import zmq
from msgspec import msgpack
from pydantic import GetCoreSchemaHandler
from pydantic_core import core_schema

from vllm import envs
from vllm.logger import init_logger
from vllm.multimodal.inputs import (
    BaseMultiModalField,
    MultiModalBatchedField,
    MultiModalFieldConfig,
    MultiModalFieldElem,
    MultiModalFlatField,
    MultiModalKwargsItem,
    MultiModalKwargsItems,
    MultiModalSharedField,
    NestedTensors,
)
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.utils import tensor_data

logger = init_logger(__name__)

CUSTOM_TYPE_PICKLE = 1
CUSTOM_TYPE_CLOUDPICKLE = 2
CUSTOM_TYPE_RAW_VIEW = 3

# MultiModalField class serialization type map.
# These need to list all possible field types and match them
# to factory methods in `MultiModalFieldConfig`.
MMF_CLASS_TO_FACTORY: dict[type[BaseMultiModalField], str] = {
    MultiModalFlatField: "flat",
    MultiModalSharedField: "shared",
    MultiModalBatchedField: "batched",
}

bytestr: TypeAlias = bytes | bytearray | memoryview | zmq.Frame


class OOBTensorConsumer(ABC):
    @abstractmethod
    def __call__(self, tensor: torch.Tensor) -> dict | None:
        """
        Called with tensors for the current message.

        Return None to reject the tensor (it then falls back to regular
        serialization); otherwise return a dict of arbitrary placeholder
        data to be included in the serialized message.
        """
        return None

    @abstractmethod
    def new_message(self) -> None:
        """Called at the start of each new encoded message."""
        pass


# dtype, shape, metadata -> tensor
OOBTensorProvider = Callable[[str, tuple[int, ...], dict], torch.Tensor]
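
# A minimal sketch of how the two ends fit together (illustrative only; the
# names `_HANDLE_TABLE`, `TableConsumer`, and `table_provider` are not part of
# vLLM). A real consumer would hand tensors off via shared memory or a
# GPU-aware transport and carry its bookkeeping in the returned metadata dict:
#
#     _HANDLE_TABLE: dict[int, torch.Tensor] = {}
#
#     class TableConsumer(OOBTensorConsumer):
#         def __init__(self) -> None:
#             self._next_id = 0
#
#         def new_message(self) -> None:
#             pass  # reset any per-message state here
#
#         def __call__(self, tensor: torch.Tensor) -> dict | None:
#             if tensor.nbytes < 1 << 20:
#                 return None  # reject: falls back to regular serialization
#             handle, self._next_id = self._next_id, self._next_id + 1
#             _HANDLE_TABLE[handle] = tensor
#             return {"handle": handle}
#
#     def table_provider(
#         dtype: str, shape: tuple[int, ...], meta: dict
#     ) -> torch.Tensor:
#         return _HANDLE_TABLE.pop(meta["handle"])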


def _log_insecure_serialization_warning():
    logger.warning_once(
        "Allowing insecure serialization using pickle due to "
        "VLLM_ALLOW_INSECURE_SERIALIZATION=1"
    )


def _typestr(val: Any) -> tuple[str, str] | None:
    if val is None:
        return None
    t = type(val)
    return t.__module__, t.__qualname__


def _encode_type_info_recursive(obj: Any) -> Any:
    """Recursively encode type information for nested structures of
    lists/dicts."""
    if obj is None:
        return None
    if type(obj) is list:
        return [_encode_type_info_recursive(item) for item in obj]
    if type(obj) is dict:
        return {k: _encode_type_info_recursive(v) for k, v in obj.items()}
    return _typestr(obj)


def _decode_type_info_recursive(
    type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any], Any]
) -> Any:
    """Recursively decode type information for nested structures of
    lists/dicts."""
    if type_info is None:
        return data
    if isinstance(type_info, dict):
        assert isinstance(data, dict)
        return {
            k: _decode_type_info_recursive(type_info[k], data[k], convert_fn)
            for k in type_info
        }
    if isinstance(type_info, list) and (
        # Exclude serialized tensors/numpy arrays.
        len(type_info) != 2 or not isinstance(type_info[0], str)
    ):
        assert isinstance(data, list)
        return [
            _decode_type_info_recursive(ti, d, convert_fn)
            for ti, d in zip(type_info, data)
        ]
    return convert_fn(type_info, data)
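
# Worked example (illustrative): _encode_type_info_recursive({"a": [None, 1.5]})
# yields {"a": [None, ("builtins", "float")]}. On the decode side,
# _decode_type_info_recursive mirrors the same structure and invokes convert_fn
# only on the ("builtins", "float") leaf to rebuild the typed value.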


class UtilityResult:
    """Wrapper for special handling when serializing/deserializing."""

    def __init__(self, r: Any = None):
        self.result = r


class MsgpackEncoder:
    """Encoder with custom torch tensor and numpy array serialization.

    Note that unlike vanilla `msgspec` Encoders, this interface is generally
    not thread-safe when encoding tensors / numpy arrays.

    By default, arrays below 256B are serialized inline. Larger ones are sent
    via dedicated messages. Note that this is a per-tensor limit.

    When an ``oob_tensor_consumer`` is provided, tensors (CUDA and CPU) will
    be offered to it for out-of-band handling.
    """

    def __init__(
        self,
        size_threshold: int | None = None,
        oob_tensor_consumer: OOBTensorConsumer | None = None,
    ):
        if size_threshold is None:
            size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD
        self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
        # This is used as a local stash of buffers that we can then access from
        # our custom `msgspec` hook, `enc_hook`. We don't have a way to
        # pass custom data to the hook otherwise.
        self.aux_buffers: list[bytestr] | None = None
        self.size_threshold = size_threshold
        self.oob_tensor_consumer = oob_tensor_consumer
        if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
            _log_insecure_serialization_warning()

    def encode(self, obj: Any) -> Sequence[bytestr]:
        try:
            if self.oob_tensor_consumer is not None:
                self.oob_tensor_consumer.new_message()
            self.aux_buffers = bufs = [b""]
            bufs[0] = self.encoder.encode(obj)
            # This `bufs` list allows us to collect direct pointers to backing
            # buffers of tensors and np arrays, and return them along with the
            # top-level encoded buffer instead of copying their data into the
            # new buffer.
            return bufs
        finally:
            self.aux_buffers = None

    def encode_into(self, obj: Any, buf: bytearray) -> Sequence[bytestr]:
        try:
            if self.oob_tensor_consumer is not None:
                self.oob_tensor_consumer.new_message()
            self.aux_buffers = [buf]
            bufs = self.aux_buffers
            self.encoder.encode_into(obj, buf)
            return bufs
        finally:
            self.aux_buffers = None

    def enc_hook(self, obj: Any) -> Any:
        if isinstance(obj, torch.Tensor):
            return self._encode_tensor(obj)

        # Fall back to pickle for object or void kind ndarrays.
        if isinstance(obj, np.ndarray) and obj.dtype.kind not in ("O", "V"):
            return self._encode_ndarray(obj)

        if isinstance(obj, slice):
            # We are assuming only int-based values will be used here.
            return tuple(
                int(v) if v is not None else None
                for v in (obj.start, obj.stop, obj.step)
            )

        if isinstance(obj, MultiModalKwargsItem):
            return self._encode_mm_item(obj)

        if isinstance(obj, MultiModalKwargsItems):
            return self._encode_mm_items(obj)

        if isinstance(obj, UtilityResult):
            result = obj.result
            if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
                return None, result
            # Since utility results are not strongly typed, we recursively
            # encode type information for nested structures of lists/dicts
            # to help with correct msgspec deserialization.
            return _encode_type_info_recursive(result), result

        if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
            raise TypeError(
                f"Object of type {type(obj)} is not serializable. "
                "Set VLLM_ALLOW_INSECURE_SERIALIZATION=1 to allow "
                "fallback to pickle-based serialization."
            )

        if isinstance(obj, FunctionType):
            # `pickle` is generally faster than cloudpickle, but can have
            # problems serializing methods.
            return msgpack.Ext(CUSTOM_TYPE_CLOUDPICKLE, cloudpickle.dumps(obj))

        return msgpack.Ext(
            CUSTOM_TYPE_PICKLE, pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
        )

    def _encode_ndarray(
        self, obj: np.ndarray
    ) -> tuple[str, tuple[int, ...], int | memoryview]:
        assert self.aux_buffers is not None
        # If the array is non-contiguous, we need to copy it first
        arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
        if not obj.shape or obj.nbytes < self.size_threshold:
            # Encode small arrays and scalars inline. Using this extension type
            # ensures we can avoid copying when decoding.
            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr_data)
        else:
            # Otherwise encode index of backing buffer to avoid copy.
            data = len(self.aux_buffers)
            self.aux_buffers.append(arr_data)

        # We serialize the ndarray as a tuple of native types.
        # The data is either inlined if small, or an index into a list of
        # backing buffers that we've stashed in `aux_buffers`.
        return obj.dtype.str, obj.shape, data

    def _encode_tensor(
        self, obj: torch.Tensor
    ) -> tuple[str, tuple[int, ...], int | dict | memoryview]:
        oob_consumer = self.oob_tensor_consumer
        # View the tensor as a contiguous 1D array of bytes.
        if obj.nbytes < self.size_threshold and obj.is_cpu:
            # Smaller tensors are encoded inline, just like ndarrays.
            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, tensor_data(obj))
        elif oob_consumer is not None and (data := oob_consumer(obj)) is not None:
            assert isinstance(data, dict)
        else:
            # Otherwise encode index of backing buffer to avoid copy.
            assert self.aux_buffers is not None
            data = len(self.aux_buffers)
            self.aux_buffers.append(tensor_data(obj))
        dtype = str(obj.dtype).removeprefix("torch.")
        return dtype, obj.shape, data

    def _encode_mm_items(self, items: MultiModalKwargsItems) -> dict[str, Any]:
        return {
            modality: [self._encode_mm_item(item) for item in itemlist]
            for modality, itemlist in items.items()
        }

    def _encode_mm_item(self, item: MultiModalKwargsItem) -> dict[str, Any]:
        return {key: self._encode_mm_field_elem(elem) for key, elem in item.items()}

    def _encode_mm_field_elem(self, elem: MultiModalFieldElem) -> dict[str, Any]:
        return {
            "data": (
                None if elem.data is None else self._encode_nested_tensors(elem.data)
            ),
            "field": self._encode_mm_field(elem.field),
        }

    def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
        if isinstance(nt, torch.Tensor):
            return self._encode_tensor(nt)
        if isinstance(nt, (int, float)):
            # Although it violates NestedTensors type, MultiModalKwargs
            # values are sometimes floats.
            return nt
        return [self._encode_nested_tensors(x) for x in nt]

    def _encode_mm_field(self, field: BaseMultiModalField):
        # Figure out the factory name for the field type.
        name = MMF_CLASS_TO_FACTORY.get(field.__class__)
        if not name:
            raise TypeError(f"Unsupported field type: {field.__class__}")

        # We just need to copy all of the field values in order,
        # which will then be used to reconstruct the field.
        factory_kw = {f.name: getattr(field, f.name) for f in dataclasses.fields(field)}
        return name, factory_kw
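
# Illustrative usage (a sketch, assuming an existing pyzmq socket `sock`):
#
#     encoder = MsgpackEncoder()
#     bufs = encoder.encode({"x": torch.zeros(1024)})
#     # `bufs` is the top-level msgpack message followed by zero-copy views of
#     # any large tensor/ndarray buffers, e.g. for
#     # sock.send_multipart(bufs, copy=False).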


class MsgpackDecoder:
    """Decoder with custom torch tensor and numpy array serialization.

    Note that unlike vanilla `msgspec` Decoders, this interface is generally
    not thread-safe when decoding tensors / numpy arrays.

    An ``oob_tensor_provider`` must be supplied when an ``OOBTensorConsumer``
    is used on the encoder side.
    """

    def __init__(
        self,
        t: Any | None = None,
        share_mem: bool = True,
        oob_tensor_provider: OOBTensorProvider | None = None,
    ):
        self.share_mem = share_mem
        self.pin_tensors = is_pin_memory_available()
        args = () if t is None else (t,)
        self.decoder = msgpack.Decoder(
            *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook
        )
        self.aux_buffers: Sequence[bytestr] = ()
        self.oob_tensor_provider = oob_tensor_provider
        if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
            _log_insecure_serialization_warning()

    def decode(self, bufs: bytestr | Sequence[bytestr]) -> Any:
        if isinstance(bufs, bytestr):  # type: ignore
            return self.decoder.decode(bufs)

        self.aux_buffers = bufs
        try:
            return self.decoder.decode(bufs[0])
        finally:
            self.aux_buffers = ()

    def dec_hook(self, t: type, obj: Any) -> Any:
        # Given native types in `obj`, convert to type `t`.
        if isclass(t):
            if issubclass(t, np.ndarray):
                return self._decode_ndarray(obj)
            if issubclass(t, torch.Tensor):
                return self._decode_tensor(obj)
            if t is slice:
                return slice(*obj)
            if issubclass(t, MultiModalKwargsItem):
                return self._decode_mm_item(obj)
            if issubclass(t, MultiModalKwargsItems):
                return self._decode_mm_items(obj)
            if t is UtilityResult:
                return self._decode_utility_result(obj)
        return obj

    def _decode_utility_result(self, obj: Any) -> UtilityResult:
        result_type, result = obj
        if result_type is not None:
            if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
                raise TypeError(
                    "VLLM_ALLOW_INSECURE_SERIALIZATION must "
                    "be set to use custom utility result types"
                )
            # Use recursive decoding to handle nested structures
            result = _decode_type_info_recursive(
                result_type, result, self._convert_result
            )
        return UtilityResult(result)

    def _convert_result(self, result_type: Sequence[str], result: Any) -> Any:
        if result_type is None:
            return result
        mod_name, name = result_type
        mod = importlib.import_module(mod_name)
        result_type = getattr(mod, name)
        return msgspec.convert(result, result_type, dec_hook=self.dec_hook)

    def _decode_ndarray(self, arr: Any) -> np.ndarray:
        dtype, shape, data = arr
        # Zero-copy decode. We assume the ndarray will not be kept around,
        # as it now locks the whole received message buffer in memory.
        buffer = self.aux_buffers[data] if isinstance(data, int) else data
        arr = np.frombuffer(buffer, dtype=dtype)
        if not self.share_mem:
            arr = arr.copy()
        return arr.reshape(shape)

    def _decode_tensor(self, arr: Any) -> torch.Tensor:
        dtype, shape, data = arr
        if isinstance(data, dict):
            assert self.oob_tensor_provider, (
                "Received OOB tensor but tensor provider is not set"
            )
            return self.oob_tensor_provider(dtype, shape, data)

        is_aux = isinstance(data, int)
        buffer = self.aux_buffers[data] if is_aux else data
        buffer = buffer if isinstance(buffer, memoryview) else memoryview(buffer)
        torch_dtype = getattr(torch, dtype)
        assert isinstance(torch_dtype, torch.dtype)
        if not buffer.nbytes:  # torch.frombuffer doesn't like empty buffers
            assert 0 in shape
            return torch.empty(shape, dtype=torch_dtype)
        # Create uint8 array
        arr = torch.frombuffer(buffer, dtype=torch.uint8)
        # Clone ensures tensor is backed by pytorch-owned memory for safe
        # future async CPU->GPU transfer.
        # Pin larger tensors for more efficient CPU->GPU transfer.
        if not is_aux:
            arr = arr.clone()
        elif not self.share_mem:
            arr = arr.pin_memory() if self.pin_tensors else arr.clone()
        # Convert back to proper shape & type
        return arr.view(torch_dtype).view(shape)

    def _decode_mm_items(self, obj: dict[str, Any]) -> MultiModalKwargsItems:
        return MultiModalKwargsItems(
            {
                modality: [self._decode_mm_item(item) for item in itemlist]
                for modality, itemlist in obj.items()
            }
        )

    def _decode_mm_item(self, obj: dict[str, Any]) -> MultiModalKwargsItem:
        return MultiModalKwargsItem(
            {key: self._decode_mm_field_elem(elem) for key, elem in obj.items()}
        )

    def _decode_mm_field_elem(self, obj: dict[str, Any]) -> MultiModalFieldElem:
        if obj["data"] is not None:
            obj["data"] = self._decode_nested_tensors(obj["data"])

        # Reconstruct the field processor using MultiModalFieldConfig
        factory_meth_name, factory_kw = obj["field"]
        factory_meth = getattr(MultiModalFieldConfig, factory_meth_name)

        # Special case: decode the union "slices" field of
        # MultiModalFlatField
        if factory_meth_name == "flat":
            factory_kw["slices"] = self._decode_nested_slices(factory_kw["slices"])

        obj["field"] = factory_meth("", **factory_kw).field
        return MultiModalFieldElem(**obj)

    def _decode_nested_tensors(self, obj: Any) -> NestedTensors:
        if isinstance(obj, (int, float)):
            # Although it violates NestedTensors type, MultiModalKwargs
            # values are sometimes floats.
            return obj
        if not isinstance(obj, list):
            raise TypeError(f"Unexpected NestedTensors contents: {type(obj)}")
        if obj and isinstance(obj[0], str):
            return self._decode_tensor(obj)
        return [self._decode_nested_tensors(x) for x in obj]

    def _decode_nested_slices(self, obj: Any) -> Any:
        assert isinstance(obj, (list, tuple))
        if obj and not isinstance(obj[0], (list, tuple)):
            return slice(*obj)
        return [self._decode_nested_slices(x) for x in obj]

    def ext_hook(self, code: int, data: memoryview) -> Any:
        if code == CUSTOM_TYPE_RAW_VIEW:
            return data

        if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
            if code == CUSTOM_TYPE_PICKLE:
                return pickle.loads(data)
            if code == CUSTOM_TYPE_CLOUDPICKLE:
                return cloudpickle.loads(data)

        raise NotImplementedError(f"Extension type code {code} is not supported")
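
# Illustrative round trip (a sketch; the encoder's output buffers are fed
# straight to the decoder, as they would be across a multipart ZMQ message):
#
#     enc = MsgpackEncoder()
#     dec = MsgpackDecoder(dict[str, torch.Tensor])
#     bufs = enc.encode({"x": torch.arange(4)})
#     restored = dec.decode(bufs)  # {"x": tensor([0, 1, 2, 3])}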


def run_method(
    obj: Any,
    method: str | bytes | Callable,
    args: tuple[Any, ...],
    kwargs: dict[str, Any],
) -> Any:
    """
    Run a method of an object with the given arguments and keyword arguments.

    If the method is a string, it is resolved to a bound method via getattr.
    If the method is serialized bytes, it is deserialized using cloudpickle.
    If the method is a callable, it is called directly.
    """
    if isinstance(method, bytes):
        func = partial(cloudpickle.loads(method), obj)
    elif isinstance(method, str):
        try:
            func = getattr(obj, method)
        except AttributeError:
            raise NotImplementedError(
                f"Method {method!r} is not implemented."
            ) from None
    else:
        func = partial(method, obj)  # type: ignore
    return func(*args, **kwargs)
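
# Illustrative dispatch (a sketch; `worker` and `Worker.ping` are hypothetical):
#
#     run_method(worker, "ping", (), {})                          # string path
#     run_method(worker, cloudpickle.dumps(Worker.ping), (), {})  # bytes path
#     run_method(worker, Worker.ping, (), {})                     # callable path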


class PydanticMsgspecMixin:
    """Make a ``msgspec.Struct`` compatible with Pydantic for both
    **validation** (JSON/dict -> Struct) and **serialization**
    (Struct -> JSON-safe dict).

    Subclasses may set ``__pydantic_msgspec_exclude__`` (a ``set[str]``)
    to list non-underscore field names that should also be stripped from
    serialized output. Fields whose names start with ``_`` are always
    excluded automatically.
    """

    # Subclasses can override to exclude additional public-but-internal keys.
    __pydantic_msgspec_exclude__: ClassVar[set[str]] = set()

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: Any, handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """
        Make msgspec.Struct compatible with Pydantic, respecting defaults.

        Handles JSON => msgspec.Struct. Used when exposing msgspec.Struct to
        the API as input or in `/docs`. Note this is cached by Pydantic and
        not called on every validation.
        """
        msgspec_fields = {f.name: f for f in msgspec.structs.fields(source_type)}
        type_hints = get_type_hints(source_type)

        # Build the Pydantic typed_dict_field for each msgspec field
        fields = {}
        for name, hint in type_hints.items():
            if name not in msgspec_fields:
                # Skip ClassVar and other non-struct annotations.
                continue
            # Skip private fields; they are excluded from serialization
            # and should not appear in the generated JSON/OpenAPI schema.
            if name.startswith("_"):
                continue
            msgspec_field = msgspec_fields[name]

            # Build the field schema via the handler.
            field_schema = handler(hint)

            # Add the default value to the schema.
            # Mark fields with defaults as not required so the generated
            # JSON Schema stays consistent with ``omit_defaults=True``
            # serialization (fields at their default value may be absent).
            if msgspec_field.default_factory is not msgspec.NODEFAULT:
                wrapped_schema = core_schema.with_default_schema(
                    schema=field_schema,
                    default_factory=msgspec_field.default_factory,
                )
                fields[name] = core_schema.typed_dict_field(
                    wrapped_schema, required=False
                )
            elif msgspec_field.default is not msgspec.NODEFAULT:
                wrapped_schema = core_schema.with_default_schema(
                    schema=field_schema,
                    default=msgspec_field.default,
                )
                fields[name] = core_schema.typed_dict_field(
                    wrapped_schema, required=False
                )
            else:
                # No default, so Pydantic will treat it as required
                fields[name] = core_schema.typed_dict_field(field_schema)

        typed_dict_then_convert = core_schema.no_info_after_validator_function(
            cls._validate_msgspec,
            core_schema.typed_dict_schema(fields),
        )

        # Build a serializer that strips private / excluded fields.
        serializer = core_schema.plain_serializer_function_ser_schema(
            cls._serialize_msgspec,
            info_arg=False,
        )

        # Accept either an already-constructed msgspec.Struct instance or a
        # JSON/dict-like payload.
        return core_schema.union_schema(
            [
                core_schema.is_instance_schema(source_type),
                typed_dict_then_convert,
            ],
            serialization=serializer,
        )

    @classmethod
    def _validate_msgspec(cls, value: Any) -> Any:
        """Validate and convert input to a msgspec.Struct instance."""
        if isinstance(value, cls):
            return value
        if isinstance(value, dict):
            return cls(**value)
        return msgspec.convert(value, type=cls)

    @staticmethod
    def _serialize_msgspec(value: Any) -> Any:
        """Serialize a msgspec.Struct to a JSON-compatible dict, stripping
        private (``_``-prefixed) and explicitly excluded fields.

        Uses ``msgspec.to_builtins`` which respects ``omit_defaults=True``,
        so only fields that differ from their declared defaults are included.
        """
        raw = msgspec.to_builtins(value)
        if not isinstance(raw, dict):
            return raw

        exclude: set[str] = cast(
            set[str],
            getattr(type(value), "__pydantic_msgspec_exclude__", set()),
        )
        for key in list(raw):
            if key.startswith("_") or key in exclude:
                del raw[key]

        return raw
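
# Illustrative sketch (`MyStruct` is hypothetical, not part of vLLM):
#
#     from pydantic import TypeAdapter
#
#     class MyStruct(PydanticMsgspecMixin, msgspec.Struct, omit_defaults=True):
#         name: str
#         count: int = 0
#
#     adapter = TypeAdapter(MyStruct)
#     obj = adapter.validate_python({"name": "a"})  # MyStruct(name="a", count=0)
#     adapter.dump_python(obj)                      # {"name": "a"}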