[V1] guidance backend for structured output + auto fallback mode (#14779)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Loc Huynh <jc1da.3011@gmail.com>
Co-authored-by: Michal Moskal <michal@moskal.me>
This commit is contained in:
Russell Bryant
2025-03-25 00:02:33 -04:00
committed by GitHub
parent 10b34e36b9
commit a09ad90a72
9 changed files with 344 additions and 110 deletions

View File

@@ -4,7 +4,6 @@ import time
from collections.abc import Mapping
from typing import Optional, Union
import vllm.platforms
from vllm.config import VllmConfig
from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
PromptType, SingletonInputsAdapter)
@@ -20,7 +19,10 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.structured_output.utils import validate_structured_output_request
from vllm.v1.structured_output.backend_guidance import (
validate_guidance_grammar)
from vllm.v1.structured_output.utils import (
validate_structured_output_request_xgrammar)
class Processor:
@@ -120,7 +122,9 @@ class Processor:
if not params.guided_decoding or not self.decoding_config:
return
supported_backends = ["xgrammar", "xgrammar:disable-any-whitespace"]
supported_backends = [
"xgrammar", "xgrammar:disable-any-whitespace", "guidance", "auto"
]
engine_level_backend = self.decoding_config.guided_decoding_backend
if engine_level_backend not in supported_backends:
raise ValueError(f"Only {supported_backends} structured output is "
@@ -134,10 +138,31 @@ class Processor:
else:
params.guided_decoding.backend = engine_level_backend
if vllm.platforms.current_platform.is_tpu():
raise ValueError("Structured output is not supported on TPU.")
# Request content validation
validate_structured_output_request(params)
if engine_level_backend == "xgrammar":
# xgrammar with no fallback
validate_structured_output_request_xgrammar(params)
params.guided_decoding.backend = "xgrammar"
elif engine_level_backend == "auto":
# "auto" is an opt-in to opinionated behavior where we try to
# choose a backend based on request contents. This is not the
# default as it is less predictable and subject to change
# between releases as feature support changes.
try:
validate_structured_output_request_xgrammar(params)
params.guided_decoding.backend = "xgrammar"
except ValueError:
# The request includes some jsonschema feature(s) that
# are not supported in xgrammar. Fall back to guidance.
params.guided_decoding.backend = "guidance"
if params.guided_decoding.backend == "guidance":
# TODO: ideally we would have the LLTokenizer here, since Lark syntax
# allows <|special_token|> and similar; see
# https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
# Without a tokenizer, these are disallowed in grammars.
validate_guidance_grammar(params, tokenizer=None)
def process_inputs(
self,