[V0 deprecation] Guided decoding (#21347)

Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Reza Barazesh
2025-07-29 03:15:30 -07:00
committed by GitHub
parent a4528f0cac
commit 37efc63b64
29 changed files with 103 additions and 2809 deletions

View File

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import time
from collections import Counter as collectionsCounter
from collections import deque
@@ -36,8 +35,6 @@ from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.logits_process import get_bad_words_logits_processors
from vllm.lora.request import LoRARequest
from vllm.model_executor.guided_decoding import (
get_local_guided_decoding_logits_processor)
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.processing import EncDecMultiModalProcessor
@@ -686,11 +683,10 @@ class LLMEngine:
"Priority scheduling is not enabled.")
if isinstance(params, SamplingParams) \
and (params.guided_decoding or params.logits_processors) \
and params.logits_processors \
and self.scheduler_config.num_scheduler_steps > 1:
raise ValueError(
"Guided decoding and logits processors are not supported "
"in multi-step decoding")
"Logits processors are not supported in multi-step decoding")
if arrival_time is None:
arrival_time = time.time()
@@ -1226,7 +1222,7 @@ class LLMEngine:
engine = LLMEngine.from_engine_args(engine_args)
example_inputs = [(0, "What is LLM?",
SamplingParams(temperature=0.0))]
# Start the engine with an event loop
while True:
if example_inputs:
@@ -1983,43 +1979,13 @@ class LLMEngine:
def _build_logits_processors(
self, sampling_params: SamplingParams,
lora_request: Optional[LoRARequest]) -> SamplingParams:
"""Constructs logits processors based on the guided_decoding,
logits_bias, and allowed_token_ids fields in sampling_params. Deletes
those fields and adds the constructed logits processors to the
logits_processors field. Returns the modified sampling params."""
"""Constructs logits processors based on the logits_bias, and
allowed_token_ids fields in sampling_params. Deletes those fields and
adds the constructed logits processors to the logits_processors field.
Returns the modified sampling params."""
logits_processors = []
if sampling_params.guided_decoding is not None:
# Defensively copy sampling params since guided decoding logits
# processors can have different state for each request
sampling_params = copy.copy(sampling_params)
guided_decoding = sampling_params.guided_decoding
logger.debug(
"Building guided decoding logits processor in "
"LLMEngine. Params: %s", guided_decoding)
tokenizer = self.get_tokenizer(lora_request=lora_request)
guided_decoding.backend = guided_decoding.backend or \
self.decoding_config.backend
if self.decoding_config.reasoning_backend:
logger.debug("Building with reasoning backend %s",
self.decoding_config.reasoning_backend)
processor = get_local_guided_decoding_logits_processor(
guided_params=guided_decoding,
tokenizer=tokenizer,
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
)
if processor:
logits_processors.append(processor)
# Unset so this doesn't get passed down to the model
sampling_params.guided_decoding = None
if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
tokenizer = self.get_tokenizer(lora_request=lora_request)