[V0 deprecation] Guided decoding (#21347)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import copy
|
||||
import time
|
||||
from collections import Counter as collectionsCounter
|
||||
from collections import deque
|
||||
@@ -36,8 +35,6 @@ from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logits_process import get_bad_words_logits_processors
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.guided_decoding import (
|
||||
get_local_guided_decoding_logits_processor)
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.multimodal.processing import EncDecMultiModalProcessor
|
||||
@@ -686,11 +683,10 @@ class LLMEngine:
|
||||
"Priority scheduling is not enabled.")
|
||||
|
||||
if isinstance(params, SamplingParams) \
|
||||
and (params.guided_decoding or params.logits_processors) \
|
||||
and params.logits_processors \
|
||||
and self.scheduler_config.num_scheduler_steps > 1:
|
||||
raise ValueError(
|
||||
"Guided decoding and logits processors are not supported "
|
||||
"in multi-step decoding")
|
||||
"Logits processors are not supported in multi-step decoding")
|
||||
|
||||
if arrival_time is None:
|
||||
arrival_time = time.time()
|
||||
@@ -1226,7 +1222,7 @@ class LLMEngine:
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
example_inputs = [(0, "What is LLM?",
|
||||
SamplingParams(temperature=0.0))]
|
||||
|
||||
|
||||
# Start the engine with an event loop
|
||||
while True:
|
||||
if example_inputs:
|
||||
@@ -1983,43 +1979,13 @@ class LLMEngine:
|
||||
def _build_logits_processors(
|
||||
self, sampling_params: SamplingParams,
|
||||
lora_request: Optional[LoRARequest]) -> SamplingParams:
|
||||
"""Constructs logits processors based on the guided_decoding,
|
||||
logits_bias, and allowed_token_ids fields in sampling_params. Deletes
|
||||
those fields and adds the constructed logits processors to the
|
||||
logits_processors field. Returns the modified sampling params."""
|
||||
"""Constructs logits processors based on the logits_bias, and
|
||||
allowed_token_ids fields in sampling_params. Deletes those fields and
|
||||
adds the constructed logits processors to the logits_processors field.
|
||||
Returns the modified sampling params."""
|
||||
|
||||
logits_processors = []
|
||||
|
||||
if sampling_params.guided_decoding is not None:
|
||||
# Defensively copy sampling params since guided decoding logits
|
||||
# processors can have different state for each request
|
||||
sampling_params = copy.copy(sampling_params)
|
||||
guided_decoding = sampling_params.guided_decoding
|
||||
|
||||
logger.debug(
|
||||
"Building guided decoding logits processor in "
|
||||
"LLMEngine. Params: %s", guided_decoding)
|
||||
|
||||
tokenizer = self.get_tokenizer(lora_request=lora_request)
|
||||
guided_decoding.backend = guided_decoding.backend or \
|
||||
self.decoding_config.backend
|
||||
|
||||
if self.decoding_config.reasoning_backend:
|
||||
logger.debug("Building with reasoning backend %s",
|
||||
self.decoding_config.reasoning_backend)
|
||||
|
||||
processor = get_local_guided_decoding_logits_processor(
|
||||
guided_params=guided_decoding,
|
||||
tokenizer=tokenizer,
|
||||
model_config=self.model_config,
|
||||
reasoning_backend=self.decoding_config.reasoning_backend,
|
||||
)
|
||||
if processor:
|
||||
logits_processors.append(processor)
|
||||
|
||||
# Unset so this doesn't get passed down to the model
|
||||
sampling_params.guided_decoding = None
|
||||
|
||||
if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
|
||||
tokenizer = self.get_tokenizer(lora_request=lora_request)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user