[Chore] Cleanup guided namespace, move to structured outputs config (#22772)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 05:20:27 -04:00
parent 05b044e698
commit 29283e8976
51 changed files with 579 additions and 806 deletions
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -15,12 +15,13 @@ import torch
 from pydantic import BaseModel

 from tests.reasoning.utils import run_reasoning_extraction
+from vllm.config import StructuredOutputsConfig
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
-from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams

 if TYPE_CHECKING:
    from vllm.config import TokenizerMode
@@ -90,7 +91,7 @@ def _load_json(s: str, backend: str) -> str:

@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize(
-    "model_name, guided_decoding_backend, tokenizer_mode, speculative_config",
+    "model_name, backend, tokenizer_mode, speculative_config",
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
 def test_structured_output(
    monkeypatch: pytest.MonkeyPatch,
@@ -99,8 +100,8 @@ def test_structured_output(
    sample_sql_ebnf: str,
    sample_sql_lark: str,
    sample_regex: str,
-    sample_guided_choice: str,
-    guided_decoding_backend: str,
+    sample_structured_outputs_choices: str,
+    backend: str,
    tokenizer_mode: str,
    model_name: str,
    speculative_config: dict[str, Any],
@@ -115,16 +116,15 @@ def test_structured_output(
    enforce_eager = bool(not current_platform.is_tpu())
    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
-    llm = LLM(
-        model=model_name,
-        enforce_eager=enforce_eager,
-        max_model_len=1024,
-        guided_decoding_backend=guided_decoding_backend,
-        guided_decoding_disable_any_whitespace=(guided_decoding_backend
-                                                in {"xgrammar", "guidance"}),
-        seed=120,
-        tokenizer_mode=tokenizer_mode,
-        speculative_config=speculative_config)
+    llm = LLM(model=model_name,
+              enforce_eager=enforce_eager,
+              max_model_len=1024,
+              structured_outputs_config=dict(backend=backend,
+                                             disable_any_whitespace=backend
+                                             in {"xgrammar", "guidance"}),
+              seed=120,
+              tokenizer_mode=tokenizer_mode,
+              speculative_config=speculative_config)

    #
    # Test 1: Generate JSON output based on a provided schema
@@ -132,7 +132,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
-        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
+        structured_outputs=StructuredOutputsParams(json=sample_json_schema))

    prompt = ("Give an example JSON for an employee profile that fits this "
              "schema. Make the response as short as possible. Schema: "
@@ -152,7 +152,7 @@ def test_structured_output(

        generated_text = output.outputs[0].text
        assert generated_text is not None
-        if guided_decoding_backend != 'lm-format-enforcer':
+        if backend != 'lm-format-enforcer':
            assert "\n" not in generated_text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = json.loads(generated_text)
@@ -161,12 +161,12 @@ def test_structured_output(
    #
    # Test 2: Generate JSON object without a schema
    #
-    if guided_decoding_backend != "outlines":
+    if backend != "outlines":
        sampling_params = SamplingParams(
            temperature=1.0,
            max_tokens=4096,
            n=2,
-            guided_decoding=GuidedDecodingParams(json_object=True))
+            structured_outputs=StructuredOutputsParams(json_object=True))

        outputs = llm.generate(prompts=(
            "Generate a JSON object with curly braces for a person with "
@@ -195,8 +195,9 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
-        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
-    if guided_decoding_backend.startswith("xgrammar"):
+        structured_outputs=StructuredOutputsParams(
+            json=unsupported_json_schema))
+    if backend.startswith("xgrammar"):
        with pytest.raises(ValueError,
                           match="The provided JSON schema contains features "
                           "not supported by xgrammar."):
@@ -230,7 +231,7 @@ def test_structured_output(
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)

-    if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
+    if backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 4: Generate SQL statement using EBNF grammar
        #
@@ -238,7 +239,8 @@ def test_structured_output(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
-            guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
+            structured_outputs=StructuredOutputsParams(
+                grammar=sample_sql_ebnf))
        outputs = llm.generate(
            ("Generate a sql statement that selects col_1 from "
             "table_1 where it is equal to 1. Make the response as short as "
@@ -271,7 +273,8 @@ def test_structured_output(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
-            guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
+            structured_outputs=StructuredOutputsParams(
+                grammar=sample_sql_lark))
        outputs = llm.generate(
            ("Generate a sql statement that selects col_1 from "
             "table_1 where it is equal to 1. Make the response as short as "
@@ -309,7 +312,8 @@ def test_structured_output(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
-            guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
+            structured_outputs=StructuredOutputsParams(
+                grammar="not a grammar"))
        with pytest.raises(ValueError, match="Failed to convert the grammar "):
            llm.generate(
                ("Generate a sql statement that selects col_1 from "
@@ -325,7 +329,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+        structured_outputs=StructuredOutputsParams(regex=sample_regex))

    prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
              f"Make the response as short as possible.")
@@ -352,7 +356,8 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
-        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
+        structured_outputs=StructuredOutputsParams(
+            choice=sample_structured_outputs_choices))

    outputs = llm.generate(
        ("The best language for type-safe systems programming is "
@@ -368,7 +373,7 @@ def test_structured_output(
        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None
-        assert generated_text in sample_guided_choice
+        assert generated_text in sample_structured_outputs_choices
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    #
@@ -378,7 +383,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=json_schema))
+        structured_outputs=StructuredOutputsParams(json=json_schema))

    outputs = llm.generate(
        ("Generate a JSON with the brand, model and car_type of the most "
@@ -422,7 +427,7 @@ def test_structured_output(
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
-        guided_decoding=GuidedDecodingParams(json=json_schema))
+        structured_outputs=StructuredOutputsParams(json=json_schema))

    outputs = llm.generate(
        ("Generate a description of a frog using 50 characters. "
@@ -444,7 +449,7 @@ def test_structured_output(
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=json_schema)

-    if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
+    if backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 11: Generate structured output using structural_tag format
        #
@@ -470,7 +475,7 @@ def test_structured_output(
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=4096,
-            guided_decoding=GuidedDecodingParams(
+            structured_outputs=StructuredOutputsParams(
                structural_tag=json.dumps(structural_tag_config)))

        prompt = """
@@ -547,7 +552,7 @@ Make the response as short as possible.

@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize(
-    "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
    [
        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
         "deepseek_r1", NGRAM_SPEC_CONFIG),
@@ -556,7 +561,7 @@ Make the response as short as possible.
 )
 def test_structured_output_with_reasoning_matrices(
    monkeypatch: pytest.MonkeyPatch,
-    guided_decoding_backend: str,
+    backend: str,
    tokenizer_mode: TokenizerMode,
    reasoning_parser: str,
    model_name: str,
@@ -576,10 +581,11 @@ def test_structured_output_with_reasoning_matrices(
        enforce_eager=bool(not current_platform.is_tpu()),
        max_model_len=1024,
        max_num_seqs=16,
-        guided_decoding_backend=guided_decoding_backend,
-        guided_decoding_disable_any_whitespace=True,
+        structured_outputs_config=dict(backend=backend,
+                                       disable_any_whitespace=backend
+                                       in {"xgrammar", "guidance"},
+                                       reasoning_parser=reasoning_parser),
        tokenizer_mode=tokenizer_mode,
-        reasoning_parser=reasoning_parser,
        speculative_config=speculative_config,
    )
    tokenizer = llm.get_tokenizer()
@@ -603,7 +609,7 @@ def test_structured_output_with_reasoning_matrices(
    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=8192,
-        guided_decoding=GuidedDecodingParams(json=reasoning_schema),
+        structured_outputs=StructuredOutputsParams(json=reasoning_schema),
    )
    outputs = llm.generate(
        [reasoning_prompt],
@@ -640,13 +646,14 @@ def test_structured_output_auto_mode(

    llm = LLM(model=model_name,
              max_model_len=1024,
-              guided_decoding_backend="auto",
+              structured_outputs_config=dict(backend="auto"),
              tokenizer_mode=tokenizer_mode)

    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
+        structured_outputs=StructuredOutputsParams(
+            json=unsupported_json_schema))

    prompts = (
        "Give an example JSON object for a grade "
@@ -681,9 +688,10 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):

    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
              max_model_len=1024,
-              guided_decoding_backend="guidance",
-              guided_decoding_disable_any_whitespace=True,
-              guided_decoding_disable_additional_properties=True)
+              structured_outputs_config=dict(
+                  backend="guidance",
+                  disable_any_whitespace=True,
+                  disable_additional_properties=True))

    schema = {
        'type': 'object',
@@ -709,14 +717,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
        "<|im_end|>\n<|im_start|>assistant\n")

    def generate_with_backend(backend):
-        guided_params = GuidedDecodingParams(
+        structured_outputs_params = StructuredOutputsParams(
            json=schema,
            backend=backend,
            disable_any_whitespace=True,
            disable_additional_properties=True)
-        sampling_params = SamplingParams(temperature=0,
-                                         max_tokens=256,
-                                         guided_decoding=guided_params)
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=256,
+            structured_outputs=structured_outputs_params)

        outputs = llm.generate(prompt, sampling_params=sampling_params)
        assert outputs is not None
@@ -736,12 +745,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
    assert "a6" not in generated


-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["guidance", "xgrammar", "outlines"])
-def test_structured_output_batched_with_non_guided_requests(
+@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
+def test_structured_output_batched_with_non_structured_outputs_requests(
    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
-    guided_decoding_backend: str,
+    backend: str,
 ):
    monkeypatch.setenv("VLLM_USE_V1", "1")

@@ -753,24 +761,25 @@ def test_structured_output_batched_with_non_guided_requests(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        enforce_eager=enforce_eager,
        max_model_len=1024,
-        guided_decoding_backend=guided_decoding_backend,
-        guided_decoding_disable_any_whitespace=(guided_decoding_backend
-                                                in {"xgrammar", "guidance"}),
+        structured_outputs_config=StructuredOutputsConfig(
+            backend=backend,
+            disable_any_whitespace=backend in {"xgrammar", "guidance"},
+        ),
    )

-    guided_prompt = (
+    structured_outputs_prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}")

-    non_guided_prompt = "The diameter of the Earth in kilometers is "
+    non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "

-    prompts = [guided_prompt, non_guided_prompt]
+    prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
    sampling_params = [
-        SamplingParams(
-            temperature=1.0,
-            max_tokens=400,
-            guided_decoding=GuidedDecodingParams(json=sample_json_schema)),
+        SamplingParams(temperature=1.0,
+                       max_tokens=400,
+                       structured_outputs=StructuredOutputsParams(
+                           json=sample_json_schema)),
        # No max tokens, temp=0 to assert on contents
        SamplingParams(
            seed=42,
@@ -801,16 +810,16 @@ def test_structured_output_batched_with_non_guided_requests(
        print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")

        if index == 0:
-            # First prompt is guided, expect valid JSON
+            # First prompt uses structured outputs, expect valid JSON
            assert "\n" not in generated_text
            output_json = json.loads(generated_text)
            jsonschema.validate(instance=output_json,
                                schema=sample_json_schema)
        else:
-            # Second prompt is not guided, expect valid output
+            # Second prompt has no structured outputs, expect valid output
            # Cannot assert on exact output, but we can expect it to be factual
            assert "12,742" in generated_text

-            # non-guided requests should not return a valid JSON here
+            # Requests without structured outputs should not yield valid JSON
            with pytest.raises(ValueError):
                output_json = json.loads(generated_text)