# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import time
from collections.abc import Mapping
from typing import Any, Literal, Optional, Union

from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
from vllm.inputs.parse import split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.cache import processor_cache_from_config
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.structured_output.backend_guidance import (
    validate_guidance_grammar)
from vllm.v1.structured_output.backend_lm_format_enforcer import (
    validate_structured_output_request_lm_format_enforcer)
from vllm.v1.structured_output.backend_outlines import (
    validate_structured_output_request_outlines)
from vllm.v1.structured_output.backend_xgrammar import (
    validate_xgrammar_grammar)


class Processor:
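    """Request processor for the vLLM V1 engine.

    Validates request parameters, tokenizes and (for multimodal models)
    preprocesses the prompt, and packages the result into an
    ``EngineCoreRequest`` for the engine core.

    Illustrative usage (assumes an already-constructed ``VllmConfig`` and
    ``TokenizerGroup``; not a complete setup):

        processor = Processor(vllm_config, tokenizer)
        prompt_str, engine_request = processor.process_inputs(
            request_id="req-0",
            prompt="Hello, world!",
            params=SamplingParams(max_tokens=16),
        )
    """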

    def __init__(
        self,
        vllm_config: VllmConfig,
        tokenizer: TokenizerGroup,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ):
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.decoding_config = vllm_config.decoding_config
        self.tokenizer = tokenizer

        self.generation_config_fields = (
            self.model_config.try_get_generation_config())

        self.mm_registry = mm_registry
        self.mm_processor_cache = processor_cache_from_config(
            vllm_config, mm_registry)

        self.input_preprocessor = InputPreprocessor(
            self.model_config,
            self.tokenizer,
            mm_registry,
            mm_processor_cache=self.mm_processor_cache,
        )

    def _validate_logprobs(
        self,
        params: SamplingParams,
    ) -> None:
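        """Check requested logprob counts against the model's limit.

        Both sample and prompt logprob counts must not exceed
        ``model_config.max_logprobs`` (with ``-1`` meaning the full
        vocabulary size); otherwise a ``ValueError`` is raised.
        """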
        max_logprobs = self.model_config.max_logprobs
        if max_logprobs == -1:
            max_logprobs = self.model_config.get_vocab_size()

        # Validate sample logprobs.
        if params.logprobs:
            num_logprobs = params.logprobs
            if num_logprobs == -1:
                num_logprobs = self.model_config.get_vocab_size()
            if num_logprobs > max_logprobs:
                raise ValueError(
                    f"Requested sample logprobs of {num_logprobs}, "
                    f"which is greater than max allowed: {max_logprobs}")

        # Validate prompt logprobs.
        if params.prompt_logprobs:
            num_prompt_logprobs = params.prompt_logprobs
            if num_prompt_logprobs == -1:
                num_prompt_logprobs = self.model_config.get_vocab_size()
            if num_prompt_logprobs > max_logprobs:
                raise ValueError(
                    f"Requested prompt logprobs of {num_prompt_logprobs}, "
                    f"which is greater than max allowed: {max_logprobs}")

    def _validate_sampling_params(
        self,
        params: SamplingParams,
        lora_request: Optional[LoRARequest],
    ) -> None:
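        """Validate sampling-specific fields of the request.

        Checks structured output settings, ``logit_bias`` and
        ``allowed_token_ids``; raises ``ValueError`` on invalid values.
        """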
        self._validate_structured_output(params)
        self._validate_logit_bias(params)

        if params.allowed_token_ids is None:
            return
        if not params.allowed_token_ids:
            raise ValueError("allowed_token_ids was provided but is empty!")
        if self.tokenizer is None:
            # When skip_tokenizer_init=True, we can't validate token IDs.
            # Skip validation and let the model handle invalid tokens.
            return
        tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
        vocab_size = len(tokenizer)
        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
            raise ValueError(
                "allowed_token_ids contains an out-of-vocab token id!")

    def _validate_logit_bias(
        self,
        params: SamplingParams,
    ) -> None:
        """Validate logit_bias token IDs are within vocabulary range."""
        if not params.logit_bias:
            return

        vocab_size = self.model_config.get_vocab_size()
        invalid_token_ids = []

        for token_id in params.logit_bias:
            if token_id < 0 or token_id >= vocab_size:
                invalid_token_ids.append(token_id)

        if invalid_token_ids:
            raise ValueError(
                f"token_id(s) {invalid_token_ids} in logit_bias contain "
                f"out-of-vocab token ids. Vocabulary size: {vocab_size}")

    def _validate_supported_sampling_params(
        self,
        params: SamplingParams,
    ) -> None:
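        """Reject sampling features that vLLM V1 does not support."""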
        # Best of not yet supported.
        if params.best_of is not None and params.best_of > 1:
            raise ValueError("vLLM V1 does not yet support best_of.")
        # Logits processors not supported.
        if params.logits_processors:
            raise ValueError("vLLM V1 does not support per-request "
                             "user-provided logits processors.")

    def _validate_params(
        self,
        params: Union[SamplingParams, PoolingParams],
        lora_request: Optional[LoRARequest],
    ):
        """
        Validate that the request parameters are supported.

        Raises ValueError for SamplingParams features that the API server
        cannot serve; PoolingParams are accepted without further checks.
        """

        if isinstance(params, PoolingParams):
            return

        self._validate_logprobs(params)
        self._validate_sampling_params(params, lora_request)
        self._validate_supported_sampling_params(params)

    def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
        """
        Validate that user-provided multi_modal_uuids align with
        multi_modal_data in the incoming request prompt(s).
        Only checks lengths; `None` entries are allowed and will be
        auto-hashed downstream.
        """

        def _validate_single_prompt(single_prompt: Union[dict, str]) -> None:
            if not isinstance(single_prompt, dict):
                return
            mm_data = single_prompt.get("multi_modal_data")
            mm_uuids = single_prompt.get("multi_modal_uuids")
            if not mm_data or not mm_uuids:
                return

            for modality, items in mm_data.items():
                if modality in mm_uuids:
                    data_len = len(items) if isinstance(items, list) else 1
                    uuid_len = len(mm_uuids[modality]) if isinstance(
                        mm_uuids[modality], list) else 1
                    if uuid_len != data_len:
                        raise ValueError(
                            f"multi_modal_uuids for modality '{modality}' "
                            "must have same length as data: got "
                            f"{uuid_len} uuids vs "
                            f"{data_len} items.")
                else:
                    raise ValueError(
                        f"multi_modal_uuids for modality '{modality}' must "
                        "be provided if multi_modal_data is provided.")

        # Handle explicit encoder/decoder prompts or singleton prompt
        if isinstance(prompt, dict) and "encoder_prompt" in prompt:
            enc = prompt.get("encoder_prompt")
            dec = prompt.get("decoder_prompt")
            if enc is not None:
                _validate_single_prompt(enc)
            if dec is not None:
                _validate_single_prompt(dec)
        else:
            _validate_single_prompt(prompt)  # type: ignore[arg-type]

    def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
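        """Reject LoRA requests when LoRA support is not enabled."""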
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")

    def _validate_structured_output(self, params: SamplingParams) -> None:
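        """Validate structured (guided decoding) output settings.

        Ensures a tokenizer is available, resolves the engine-level
        backend, and runs backend-specific grammar validation.
        """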
        if not params.guided_decoding or not self.decoding_config:
            return

        if self.model_config.skip_tokenizer_init and params.guided_decoding:
            raise ValueError(
                "Structured outputs require a tokenizer, so they can't be used with 'skip_tokenizer_init'"  # noqa: E501
            )

        engine_level_backend = self.decoding_config.backend
        if params.guided_decoding.backend:
            # Request-level backend selection is not supported in V1.
            # The values may differ if `params` is reused and was set
            # to a specific backend based on `auto` behavior in a previous
            # request. We remember that it was set as a result of `auto`
            # using the `_auto` option set on the backend in the params.
            if (params.guided_decoding.backend != engine_level_backend
                    and not (engine_level_backend == "auto"
                             and params.guided_decoding.backend_was_auto)):
                raise ValueError(
                    "Request-level structured output backend selection is no "
                    "longer supported. The request specified "
                    f"'{params.guided_decoding.backend}', but vLLM was "
                    f"initialised with '{engine_level_backend}'. This error "
                    "can be resolved by removing backend selection from the "
                    "request.")
            else:
                params.guided_decoding.backend = engine_level_backend

        # Request content validation
        if (isinstance(params.guided_decoding.choice, list)
                and not params.guided_decoding.choice):
            # It is invalid for choice to be an empty list
            raise ValueError(f"Choice '{params.guided_decoding.choice}' "
                             "cannot be an empty list")

        if engine_level_backend.startswith("xgrammar"):
            # xgrammar with no fallback
            validate_xgrammar_grammar(params)
        elif engine_level_backend.startswith("guidance"):
            # TODO: ideally we would have the LLTokenizer here as Lark syntax
            # allows <|special_token|> and similar, see
            # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
            # Without tokenizer these are disallowed in grammars.
            validate_guidance_grammar(params, tokenizer=None)
        elif engine_level_backend == "outlines":
            # outlines backend
            validate_structured_output_request_outlines(params)
        elif engine_level_backend == "lm-format-enforcer":
            # lm format enforcer backend
            validate_structured_output_request_lm_format_enforcer(params)
        else:
            # NOTE: engine_level_backend must be "auto" here, because we have
            # checked supported_backends above.
            # "auto" is an opt-in to opinionated behavior where we try to
            # choose a backend based on request contents. This is not the
            # default as it is less predictable and subject to change
            # between releases as feature support changes.
            try:
                validate_xgrammar_grammar(params)
                params.guided_decoding.backend = "xgrammar"
            except ValueError:
                # The request either failed validation
                # or includes some jsonschema feature(s) that
                # are not supported in xgrammar. Fall back to guidance.
                validate_guidance_grammar(params, tokenizer=None)
                params.guided_decoding.backend = "guidance"
            # Remember that this backend was set automatically
            params.guided_decoding.backend_was_auto = True

    def _maybe_build_mm_uuids(
        self,
        request_id: str,
        prompt: PromptType,
    ) -> Optional[MultiModalUUIDDict]:
        """Build per-item multimodal hash overrides when enabled. In this case,
        multimodal data items are identified by their request id, modality and
        index rather than their content.

        Returns a dictionary of modality -> list[str] of overrides, or None if
        disabled or no multimodal data is present.
        """

        def _extract_mm_data(p: PromptType):
            if isinstance(p, dict) and "encoder_prompt" in p:
                enc = p.get("encoder_prompt")
                if isinstance(enc, dict):
                    return enc.get("multi_modal_data")
                return None
            if isinstance(p, dict):
                return p.get("multi_modal_data")
            return None

        mm_data = _extract_mm_data(prompt)
        if not mm_data:
            return None

        mm_uuids: MultiModalUUIDDict = {}
        for modality, data in mm_data.items():
            n = len(data) if isinstance(data, list) else 1
            mm_uuids[modality] = [
                f"{request_id}-{modality}-{i}" for i in range(n)
            ]
        return mm_uuids

    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
    ) -> tuple[Optional[str], EngineCoreRequest]:
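        """Validate and preprocess a request into an ``EngineCoreRequest``.

        Steps: validate the parameters, resolve multimodal UUID overrides,
        tokenize / preprocess the prompt, and package the result together
        with the (possibly cloned) sampling or pooling parameters.

        Returns the decoder prompt text (if any) and the request to submit
        to the engine core.
        """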
        # TODO(woosuk): Support pooling models.
        self._validate_lora(lora_request)
        self._validate_params(params, lora_request)
        if trace_headers is not None:
            raise ValueError("V1 does not support tracing yet.")

        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
                                                   data_parallel_size):
            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
                             f"is out of range [0, {data_parallel_size}).")

        if arrival_time is None:
            arrival_time = time.time()

        # Optionally generate multimodal hash overrides to avoid hashing
        # multimodal data items by their content as their identifiers.

        # NOTE: when users explicitly turn off BOTH prefix caching and input
        # processing caching, no multimodal features or embeddings will be
        # reused across requests, therefore identifying multimodal data items
        # by their content is no longer necessary, and we create uuids with
        # request id-modality-index as multimodal hash overrides.
        if (self.model_config.multimodal_config and
                self.model_config.multimodal_config.mm_processor_cache_gb == 0
                and not self.cache_config.enable_prefix_caching):
            mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
        else:
            # Otherwise, use user-provided uuids as multimodal hash overrides
            # if provided.
            self._validate_multi_modal_uuids(prompt)
            if isinstance(prompt, dict):
                mm_uuids = prompt.get("multi_modal_uuids")
            else:
                mm_uuids = None

        # Process inputs, which includes:
        # 1. Tokenize text prompt, with LoRA request if one exists.
        # 2. For multimodal models with a merged preprocessor, preprocess
        #    multimodal data and expand prompt token ids accordingly.
        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            mm_uuids=mm_uuids,
        )
        from vllm.platforms import current_platform
        current_platform.validate_request(
            prompt=prompt,
            params=params,
            processed_inputs=processed_inputs,
        )

        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

        self._validate_model_inputs(processed_inputs, lora_request)

        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)

        sampling_params = None
        pooling_params = None
        if isinstance(params, SamplingParams):
            # TODO: can we avoid cloning here in multiproc case?
            sampling_params = params.clone()
            # If max_tokens is unset, generate up to the max_model_len.
            if sampling_params.max_tokens is None:
                sampling_params.max_tokens = (
                    self.model_config.max_model_len -
                    len(decoder_inputs["prompt_token_ids"]))
            sampling_params.update_from_generation_config(
                self.generation_config_fields, eos_token_id)
            if self.tokenizer is not None:
                sampling_params.update_from_tokenizer(
                    self.tokenizer.get_lora_tokenizer(lora_request))
        else:
            pooling_params = params.clone()

        # Multimodal related.
        mm_features: Optional[list[MultiModalFeatureSpec]] = None

        if decoder_inputs["type"] == "multimodal":
            decoder_mm_inputs = decoder_inputs["mm_kwargs"]
            decoder_mm_positions = decoder_inputs["mm_placeholders"]
            decoder_mm_hashes = decoder_inputs["mm_hashes"]

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

            mm_features = []
            for modality, idx in sorted_mm_idxs:
                mm_features.append(
                    MultiModalFeatureSpec(
                        data=decoder_mm_inputs[modality][idx],
                        modality=modality,
                        identifier=decoder_mm_hashes[modality][idx],
                        mm_position=decoder_mm_positions[modality][idx]))

        return decoder_inputs.get("prompt"), EngineCoreRequest(
            request_id=request_id,
            prompt_token_ids=decoder_inputs["prompt_token_ids"],
            mm_features=mm_features,
            sampling_params=sampling_params,
            pooling_params=pooling_params,
            eos_token_id=eos_token_id,
            arrival_time=arrival_time,
            lora_request=lora_request,
            cache_salt=decoder_inputs.get("cache_salt"),
            priority=priority,
            data_parallel_rank=data_parallel_rank,
        )

    def _validate_model_inputs(self,
                               inputs: ProcessorInputs,
                               lora_request: Optional[LoRARequest] = None):
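        """Validate encoder and decoder inputs produced by preprocessing."""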
        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

        if encoder_inputs is not None:
            self._validate_model_input(encoder_inputs,
                                       lora_request,
                                       prompt_type="encoder")

        self._validate_model_input(decoder_inputs,
                                   lora_request,
                                   prompt_type="decoder")

    def _validate_model_input(
        self,
        prompt_inputs: SingletonInputs,
        lora_request: Optional[LoRARequest],
        *,
        prompt_type: Literal["encoder", "decoder"],
    ):
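        """Check a single prompt for emptiness, vocabulary range and length."""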
        model_config = self.model_config

        prompt_ids = prompt_inputs["prompt_token_ids"]
        if not prompt_ids:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                pass  # Mllama may have empty encoder inputs for text-only data
            else:
                raise ValueError(f"The {prompt_type} prompt cannot be empty")

        if self.model_config.skip_tokenizer_init:
            tokenizer = None
        else:
            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
            max_input_id = max(prompt_ids, default=0)

            # NOTE: tokenizer.max_token_id is the tokenizer's vocab size while
            # self.model_config.get_vocab_size() is the model's vocab size.
            # For Qwen3 models, the language model has extra tokens that do
            # not exist in the tokenizer, and vice versa for multimodal
            # placeholder tokens in some multimodal models.
            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399  # noqa: E501
            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421  # noqa: E501

            # Here we take the max of the two to determine if a token id is
            # truly out-of-vocabulary.
            if max_input_id > max(tokenizer.max_token_id,
                                  self.model_config.get_vocab_size() - 1):
                raise ValueError(
                    f"Token id {max_input_id} is out of vocabulary")

        max_prompt_len = self.model_config.max_model_len
        if len(prompt_ids) > max_prompt_len:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                mm_registry = self.input_preprocessor.mm_registry
                mm_processor = mm_registry.create_processor(
                    model_config,
                    tokenizer=tokenizer,
                )
                assert isinstance(mm_processor, EncDecMultiModalProcessor)

                if mm_processor.pad_dummy_encoder_prompt:
                    return  # Skip encoder length check for Whisper and Donut

            if model_config.is_multimodal_model:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well.")
            else:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens.")

            raise ValueError(
                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
                f"longer than the maximum model length of {max_prompt_len}. "
                f"{suggestion}")

        # TODO: Find out how many placeholder tokens are there so we can
        # check that chunked prefill does not truncate them
        # max_batch_len = self.scheduler_config.max_num_batched_tokens

    def clear_cache(self) -> None:
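        """Clear the input preprocessor's processing cache."""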
        self.input_preprocessor.clear_cache()