# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import time
from collections.abc import Mapping
from typing import Any, Literal, Optional, Union

from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
from vllm.inputs.parse import split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.cache import processor_cache_from_config
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.structured_output.backend_guidance import (
    validate_guidance_grammar)
from vllm.v1.structured_output.backend_lm_format_enforcer import (
    validate_structured_output_request_lm_format_enforcer)
from vllm.v1.structured_output.backend_outlines import (
    validate_structured_output_request_outlines)
from vllm.v1.structured_output.backend_xgrammar import (
    validate_xgrammar_grammar)


class Processor:
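    """Request processor for the vLLM V1 engine.

    Validates request parameters, tokenizes and (for multimodal models)
    preprocesses the prompt, and packages the result into an
    ``EngineCoreRequest`` for the engine core.

    Illustrative usage (assumes an already-constructed ``VllmConfig`` and
    ``TokenizerGroup``; not a complete setup):

        processor = Processor(vllm_config, tokenizer)
        prompt_str, engine_request = processor.process_inputs(
            request_id="req-0",
            prompt="Hello, world!",
            params=SamplingParams(max_tokens=16),
        )
    """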

    def __init__(
        self,
        vllm_config: VllmConfig,
        tokenizer: TokenizerGroup,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ):
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.decoding_config = vllm_config.decoding_config
        self.tokenizer = tokenizer

        self.generation_config_fields = (
            self.model_config.try_get_generation_config())

        self.mm_registry = mm_registry
        self.mm_processor_cache = processor_cache_from_config(
            vllm_config, mm_registry)

        self.input_preprocessor = InputPreprocessor(
            self.model_config,
            self.tokenizer,
            mm_registry,
            mm_processor_cache=self.mm_processor_cache,
        )

    def _validate_logprobs(
        self,
        params: SamplingParams,
    ) -> None:
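        """Check requested logprob counts against the model's limit.

        Both sample and prompt logprob counts must not exceed
        ``model_config.max_logprobs`` (with ``-1`` meaning the full
        vocabulary size); otherwise a ``ValueError`` is raised.
        """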
        max_logprobs = self.model_config.max_logprobs
        if max_logprobs == -1:
            max_logprobs = self.model_config.get_vocab_size()

        # Validate sample logprobs.
        if params.logprobs:
            num_logprobs = params.logprobs
            if num_logprobs == -1:
                num_logprobs = self.model_config.get_vocab_size()
            if num_logprobs > max_logprobs:
                raise ValueError(
                    f"Requested sample logprobs of {num_logprobs}, "
                    f"which is greater than max allowed: {max_logprobs}")

        # Validate prompt logprobs.
        if params.prompt_logprobs:
            num_prompt_logprobs = params.prompt_logprobs
            if num_prompt_logprobs == -1:
                num_prompt_logprobs = self.model_config.get_vocab_size()
            if num_prompt_logprobs > max_logprobs:
                raise ValueError(
                    f"Requested prompt logprobs of {num_prompt_logprobs}, "
                    f"which is greater than max allowed: {max_logprobs}")

    def _validate_sampling_params(
        self,
        params: SamplingParams,
        lora_request: Optional[LoRARequest],
    ) -> None:
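        """Validate sampling-specific fields of the request.

        Checks structured output settings, ``logit_bias`` and
        ``allowed_token_ids``; raises ``ValueError`` on invalid values.
        """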
        self._validate_structured_output(params)
        self._validate_logit_bias(params)

        if params.allowed_token_ids is None:
            return
        if not params.allowed_token_ids:
            raise ValueError("allowed_token_ids was provided but is empty!")
        if self.tokenizer is None:
            # When skip_tokenizer_init=True, we can't validate token IDs.
            # Skip validation and let the model handle invalid tokens.
            return
        tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
        vocab_size = len(tokenizer)
        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
            raise ValueError(
                "allowed_token_ids contains an out-of-vocab token id!")

    def _validate_logit_bias(
        self,
        params: SamplingParams,
    ) -> None:
        """Validate logit_bias token IDs are within vocabulary range."""
        if not params.logit_bias:
            return

        vocab_size = self.model_config.get_vocab_size()
        invalid_token_ids = []

        for token_id in params.logit_bias:
            if token_id < 0 or token_id >= vocab_size:
                invalid_token_ids.append(token_id)

        if invalid_token_ids:
            raise ValueError(
                f"token_id(s) {invalid_token_ids} in logit_bias contain "
                f"out-of-vocab token ids. Vocabulary size: {vocab_size}")

    def _validate_supported_sampling_params(
        self,
        params: SamplingParams,
    ) -> None:
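        """Reject sampling features that vLLM V1 does not support."""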
        # Best of not yet supported.
        if params.best_of is not None and params.best_of > 1:
            raise ValueError("vLLM V1 does not yet support best_of.")
        # Logits processors not supported.
        if params.logits_processors:
            raise ValueError("vLLM V1 does not support per-request "
                             "user-provided logits processors.")

    def _validate_params(
        self,
        params: Union[SamplingParams, PoolingParams],
        lora_request: Optional[LoRARequest],
    ):
        """
        Validate that the request parameters are supported.

        Raises ValueError for SamplingParams features that the API server
        cannot serve; PoolingParams are accepted without further checks.
        """

        if isinstance(params, PoolingParams):
            return

        self._validate_logprobs(params)
        self._validate_sampling_params(params, lora_request)
        self._validate_supported_sampling_params(params)

    def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
        """
        Validate that user-provided multi_modal_uuids align with
        multi_modal_data in the incoming request prompt(s).
        Only checks lengths; `None` entries are allowed and will be
        auto-hashed downstream.
        """

        def _validate_single_prompt(single_prompt: Union[dict, str]) -> None:
            if not isinstance(single_prompt, dict):
                return
            mm_data = single_prompt.get("multi_modal_data")
            mm_uuids = single_prompt.get("multi_modal_uuids")
            if not mm_data or not mm_uuids:
                return

            for modality, items in mm_data.items():
                if modality in mm_uuids:
                    data_len = len(items) if isinstance(items, list) else 1
                    uuid_len = len(mm_uuids[modality]) if isinstance(
                        mm_uuids[modality], list) else 1
                    if uuid_len != data_len:
                        raise ValueError(
                            f"multi_modal_uuids for modality '{modality}' "
                            "must have same length as data: got "
                            f"{uuid_len} uuids vs "
                            f"{data_len} items.")
                else:
                    raise ValueError(
                        f"multi_modal_uuids for modality '{modality}' must "
                        "be provided if multi_modal_data is provided.")

        # Handle explicit encoder/decoder prompts or singleton prompt
        if isinstance(prompt, dict) and "encoder_prompt" in prompt:
            enc = prompt.get("encoder_prompt")
            dec = prompt.get("decoder_prompt")
            if enc is not None:
                _validate_single_prompt(enc)
            if dec is not None:
                _validate_single_prompt(dec)
        else:
            _validate_single_prompt(prompt)  # type: ignore[arg-type]

    def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
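        """Reject LoRA requests when LoRA support is not enabled."""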
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")

    def _validate_structured_output(self, params: SamplingParams) -> None:
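        """Validate structured (guided decoding) output settings.

        Ensures a tokenizer is available, resolves the engine-level
        backend, and runs backend-specific grammar validation.
        """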
        if not params.guided_decoding or not self.decoding_config:
            return

        if self.model_config.skip_tokenizer_init and params.guided_decoding:
            raise ValueError(
                "Structured outputs require a tokenizer, so they can't be used with 'skip_tokenizer_init'"  # noqa: E501
            )

        engine_level_backend = self.decoding_config.backend
        if params.guided_decoding.backend:
            # Request-level backend selection is not supported in V1.
            # The values may differ if `params` is reused and was set
            # to a specific backend based on `auto` behavior in a previous
            # request. We remember that it was set as a result of `auto`
            # using the `_auto` option set on the backend in the params.
            if (params.guided_decoding.backend != engine_level_backend
                    and not (engine_level_backend == "auto"
                             and params.guided_decoding.backend_was_auto)):
                raise ValueError(
                    "Request-level structured output backend selection is no "
                    "longer supported. The request specified "
                    f"'{params.guided_decoding.backend}', but vLLM was "
                    f"initialised with '{engine_level_backend}'. This error "
                    "can be resolved by removing backend selection from the "
                    "request.")
            else:
                params.guided_decoding.backend = engine_level_backend

        # Request content validation
        if (isinstance(params.guided_decoding.choice, list)
                and not params.guided_decoding.choice):
            # It is invalid for choice to be an empty list
            raise ValueError(f"Choice '{params.guided_decoding.choice}' "
                             "cannot be an empty list")

        if engine_level_backend.startswith("xgrammar"):
            # xgrammar with no fallback
            validate_xgrammar_grammar(params)
        elif engine_level_backend.startswith("guidance"):
            # TODO: ideally we would have the LLTokenizer here as Lark syntax
            # allows <|special_token|> and similar, see
            # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
            # Without tokenizer these are disallowed in grammars.
            validate_guidance_grammar(params, tokenizer=None)
        elif engine_level_backend == "outlines":
            # outlines backend
            validate_structured_output_request_outlines(params)
        elif engine_level_backend == "lm-format-enforcer":
            # lm format enforcer backend
            validate_structured_output_request_lm_format_enforcer(params)
        else:
            # NOTE: engine_level_backend must be "auto" here, because we have
            # checked supported_backends above.
            # "auto" is an opt-in to opinionated behavior where we try to
            # choose a backend based on request contents. This is not the
            # default as it is less predictable and subject to change
            # between releases as feature support changes.
            try:
                validate_xgrammar_grammar(params)
                params.guided_decoding.backend = "xgrammar"
            except ValueError:
                # The request either failed validation
                # or includes some jsonschema feature(s) that
                # are not supported in xgrammar. Fall back to guidance.
                validate_guidance_grammar(params, tokenizer=None)
                params.guided_decoding.backend = "guidance"
            # Remember that this backend was set automatically
            params.guided_decoding.backend_was_auto = True

    def _maybe_build_mm_uuids(
        self,
        request_id: str,
        prompt: PromptType,
    ) -> Optional[MultiModalUUIDDict]:
        """Build per-item multimodal hash overrides when enabled. In this case,
        multimodal data items are identified by their request id, modality and
        index rather than their content.

        Returns a dictionary of modality -> list[str] of overrides, or None if
        disabled or no multimodal data is present.
        """

        def _extract_mm_data(p: PromptType):
            if isinstance(p, dict) and "encoder_prompt" in p:
                enc = p.get("encoder_prompt")
                if isinstance(enc, dict):
                    return enc.get("multi_modal_data")
                return None
            if isinstance(p, dict):
                return p.get("multi_modal_data")
            return None

        mm_data = _extract_mm_data(prompt)
        if not mm_data:
            return None

        mm_uuids: MultiModalUUIDDict = {}
        for modality, data in mm_data.items():
            n = len(data) if isinstance(data, list) else 1
            mm_uuids[modality] = [
                f"{request_id}-{modality}-{i}" for i in range(n)
            ]
        return mm_uuids

    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
    ) -> tuple[Optional[str], EngineCoreRequest]:
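        """Validate and preprocess a request into an ``EngineCoreRequest``.

        Steps: validate the parameters, resolve multimodal UUID overrides,
        tokenize / preprocess the prompt, and package the result together
        with the (possibly cloned) sampling or pooling parameters.

        Returns the decoder prompt text (if any) and the request to submit
        to the engine core.
        """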
        # TODO(woosuk): Support pooling models.
        self._validate_lora(lora_request)
        self._validate_params(params, lora_request)
        if trace_headers is not None:
            raise ValueError("V1 does not support tracing yet.")

        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
                                                   data_parallel_size):
            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
                             f"is out of range [0, {data_parallel_size}).")

        if arrival_time is None:
            arrival_time = time.time()

        # Optionally generate multimodal hash overrides to avoid hashing
        # multimodal data items by their content as their identifiers.

        # NOTE: when users explicitly turn off BOTH prefix caching and input
        # processing caching, no multimodal features or embeddings will be
        # reused across requests, therefore identifying multimodal data items
        # by their content is no longer necessary, and we create uuids with
        # request id-modality-index as multimodal hash overrides.
        if (self.model_config.multimodal_config and
                self.model_config.multimodal_config.mm_processor_cache_gb == 0
                and not self.cache_config.enable_prefix_caching):
            mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
        else:
            # Otherwise, use user-provided uuids as multimodal hash overrides
            # if provided.
            self._validate_multi_modal_uuids(prompt)
            if isinstance(prompt, dict):
                mm_uuids = prompt.get("multi_modal_uuids")
            else:
                mm_uuids = None

        # Process inputs, which includes:
        # 1. Tokenize text prompt, with LoRA request if one exists.
        # 2. For multimodal models with a merged preprocessor, preprocess
        #    multimodal data and expand prompt token ids accordingly.
        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            mm_uuids=mm_uuids,
        )
        from vllm.platforms import current_platform
        current_platform.validate_request(
            prompt=prompt,
            params=params,
            processed_inputs=processed_inputs,
        )

        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

        self._validate_model_inputs(processed_inputs, lora_request)

        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)

        sampling_params = None
        pooling_params = None
        if isinstance(params, SamplingParams):
            # TODO: can we avoid cloning here in multiproc case?
            sampling_params = params.clone()
            # If max_tokens is unset, generate up to the max_model_len.
            if sampling_params.max_tokens is None:
                sampling_params.max_tokens = (
                    self.model_config.max_model_len -
                    len(decoder_inputs["prompt_token_ids"]))
            sampling_params.update_from_generation_config(
                self.generation_config_fields, eos_token_id)
            if self.tokenizer is not None:
                sampling_params.update_from_tokenizer(
                    self.tokenizer.get_lora_tokenizer(lora_request))
        else:
            pooling_params = params.clone()

        # Multimodal related.
        mm_features: Optional[list[MultiModalFeatureSpec]] = None

        if decoder_inputs["type"] == "multimodal":
            decoder_mm_inputs = decoder_inputs["mm_kwargs"]
            decoder_mm_positions = decoder_inputs["mm_placeholders"]
            decoder_mm_hashes = decoder_inputs["mm_hashes"]

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

            mm_features = []
            for modality, idx in sorted_mm_idxs:
                mm_features.append(
                    MultiModalFeatureSpec(
                        data=decoder_mm_inputs[modality][idx],
                        modality=modality,
                        identifier=decoder_mm_hashes[modality][idx],
                        mm_position=decoder_mm_positions[modality][idx]))

        return decoder_inputs.get("prompt"), EngineCoreRequest(
            request_id=request_id,
            prompt_token_ids=decoder_inputs["prompt_token_ids"],
            mm_features=mm_features,
            sampling_params=sampling_params,
            pooling_params=pooling_params,
            eos_token_id=eos_token_id,
            arrival_time=arrival_time,
            lora_request=lora_request,
            cache_salt=decoder_inputs.get("cache_salt"),
            priority=priority,
            data_parallel_rank=data_parallel_rank,
        )

    def _validate_model_inputs(self,
                               inputs: ProcessorInputs,
                               lora_request: Optional[LoRARequest] = None):
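        """Validate encoder and decoder inputs produced by preprocessing."""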
        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

        if encoder_inputs is not None:
            self._validate_model_input(encoder_inputs,
                                       lora_request,
                                       prompt_type="encoder")

        self._validate_model_input(decoder_inputs,
                                   lora_request,
                                   prompt_type="decoder")

    def _validate_model_input(
        self,
        prompt_inputs: SingletonInputs,
        lora_request: Optional[LoRARequest],
        *,
        prompt_type: Literal["encoder", "decoder"],
    ):
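        """Check a single prompt for emptiness, vocabulary range and length."""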
        model_config = self.model_config

        prompt_ids = prompt_inputs["prompt_token_ids"]
        if not prompt_ids:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                pass  # Mllama may have empty encoder inputs for text-only data
            else:
                raise ValueError(f"The {prompt_type} prompt cannot be empty")

        if self.model_config.skip_tokenizer_init:
            tokenizer = None
        else:
            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
            max_input_id = max(prompt_ids, default=0)

            # NOTE: tokenizer.max_token_id is the tokenizer's vocab size while
            # self.model_config.get_vocab_size() is the model's vocab size.
            # For Qwen3 models, the language model has extra tokens that do
            # not exist in the tokenizer, and vice versa for multimodal
            # placeholder tokens in some multimodal models.
            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399  # noqa: E501
            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421  # noqa: E501

            # Here we take the max of the two to determine if a token id is
            # truly out-of-vocabulary.
            if max_input_id > max(tokenizer.max_token_id,
                                  self.model_config.get_vocab_size() - 1):
                raise ValueError(
                    f"Token id {max_input_id} is out of vocabulary")

        max_prompt_len = self.model_config.max_model_len
        if len(prompt_ids) > max_prompt_len:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                mm_registry = self.input_preprocessor.mm_registry
                mm_processor = mm_registry.create_processor(
                    model_config,
                    tokenizer=tokenizer,
                )
                assert isinstance(mm_processor, EncDecMultiModalProcessor)

                if mm_processor.pad_dummy_encoder_prompt:
                    return  # Skip encoder length check for Whisper and Donut

            if model_config.is_multimodal_model:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well.")
            else:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens.")

            raise ValueError(
                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
                f"longer than the maximum model length of {max_prompt_len}. "
                f"{suggestion}")

        # TODO: Find out how many placeholder tokens are there so we can
        # check that chunked prefill does not truncate them
        # max_batch_len = self.scheduler_config.max_num_batched_tokens

    def clear_cache(self) -> None:
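        """Clear the input preprocessor's processing cache."""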
        self.input_preprocessor.clear_cache()