[V1] Support disable_any_whitespace for guidance backend (#15584)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
Russell Bryant
2025-03-28 11:46:45 -04:00
committed by GitHub
parent 541d1df486
commit 7329ff5468
6 changed files with 44 additions and 117 deletions

View File

@@ -121,7 +121,8 @@ class Processor:
return
supported_backends = [
"xgrammar", "xgrammar:disable-any-whitespace", "guidance", "auto"
"xgrammar", "xgrammar:disable-any-whitespace", "guidance",
"guidance:disable-any-whitespace", "auto"
]
engine_level_backend = self.decoding_config.guided_decoding_backend
if engine_level_backend not in supported_backends:
@@ -140,11 +141,10 @@ class Processor:
raise ValueError("Structured output is not supported on TPU.")
# Request content validation
if engine_level_backend == "xgrammar":
if engine_level_backend.startswith("xgrammar"):
# xgrammar with no fallback
validate_structured_output_request_xgrammar(params)
params.guided_decoding.backend = "xgrammar"
params.guided_decoding.backend = engine_level_backend
elif engine_level_backend == "auto":
# "auto" is an opt-in to opinionated behavior where we try to
# choose a backend based on request contents. This is not the
@@ -158,12 +158,13 @@ class Processor:
# are not supported in xgrammar. Fall back to guidance.
params.guided_decoding.backend = "guidance"
if params.guided_decoding.backend == "guidance":
if engine_level_backend.startswith("guidance"):
# TODO ideally we would have the LLTokenizer here as Lark syntax
# allows <|special_token|> and similar, see
# https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
# Without tokenizer these are disallowed in grammars.
validate_guidance_grammar(params, tokenizer=None)
params.guided_decoding.backend = engine_level_backend
def process_inputs(
self,