[V1] Set structured output backend to auto by default (#15724)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-10 13:53:26 -04:00
parent 0c54fc7273
commit 9665313c39
4 changed files with 22 additions and 68 deletions
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -20,8 +20,6 @@ from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-

@pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
    assert last_completion_tokens == 10


-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                  sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
    choice1 = chat_completion.choices[0].message.content
    assert choice1 in sample_guided_choice

@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
    choice2 = chat_completion.choices[0].message.content
    assert choice2 in sample_guided_choice
    assert choice1 != choice2


@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                sample_json_schema):

-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,


@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):

    messages = [{
        "role": "system",
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(sample_regex, ip1) is not None
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(sample_regex, ip2) is not None
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):


@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                           sample_guided_choice):

-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))

    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.content is not None
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,


@pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                "name": "dummy_function_name"
            }
        },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
    message = chat_completion.choices[0].message
    assert len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                "name": "dummy_function_name"
            }
        },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
        stream=True)

    output = []
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
        model=model_name,
        tools=tools,
        tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
    )

    assert chat_completion.choices[0].message.tool_calls is not None
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
        model=model_name,
        tools=tools,
        tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
        stream=True,
    )

@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,

@pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                  sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"