2025-05-14 18:45:24 -04:00
# ruff: noqa: E501
2025-03-07 10:19:11 -05:00
# SPDX-License-Identifier: Apache-2.0
2025-06-03 11:20:17 -07:00
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
2025-03-07 10:19:11 -05:00
import json
2025-03-28 21:14:53 +08:00
from enum import Enum
2025-12-01 19:34:58 +08:00
from typing import Any
2025-03-07 10:19:11 -05:00
import jsonschema
import pytest
2025-05-24 07:16:26 +08:00
import regex as re
2025-08-22 10:31:24 -07:00
import torch
2025-03-28 21:14:53 +08:00
from pydantic import BaseModel
2025-03-07 10:19:11 -05:00
2025-05-14 18:45:24 -04:00
from tests . reasoning . utils import run_reasoning_extraction
2025-09-18 05:20:27 -04:00
from vllm . config import StructuredOutputsConfig
2025-08-22 10:31:24 -07:00
from vllm . distributed import cleanup_dist_env_and_memory
2025-03-07 10:19:11 -05:00
from vllm . entrypoints . llm import LLM
from vllm . outputs import RequestOutput
2025-04-24 03:50:09 -06:00
from vllm . platforms import current_platform
2025-05-14 18:45:24 -04:00
from vllm . reasoning . abs_reasoning_parsers import ReasoningParserManager
2025-09-23 17:07:30 +01:00
from vllm . sampling_params import (
SamplingParams ,
StructuredOutputsParams ,
)
2025-03-07 10:19:11 -05:00
2026-03-20 05:50:34 -04:00
# Regex for a dotted-quad IPv4 address: three "octet." groups followed by a
# final octet, each octet constrained to 0-255.
SAMPLE_REGEX = (
    r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
    r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)

# Note: Ensure this only uses attributes compatible with xgrammar
SAMPLE_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {
                "type": "string",
            },
        },
        "grade": {
            "type": "string",
            "pattern": "^[A-D]$",  # Regex pattern
        },
        "email": {
            "type": "string",
            "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
        },
        "work_history": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "duration": {
                        "type": "number",
                        "minimum": 0.0,
                        "maximum": 100.0,  # Numeric range
                    },
                    "position": {"type": "string"},
                },
                "required": ["company", "duration", "position"],
                "additionalProperties": False,
            },
            "minItems": 0,
            "maxItems": 3,
        },
    },
    "required": ["name", "age", "skills", "grade", "email", "work_history"],
    "additionalProperties": False,
    "minProperties": 1,
    "maxProperties": 10,
}

# A schema unsupported by xgrammar
UNSUPPORTED_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "score": {
            "type": "integer",
            "multipleOf": 5,  # Numeric multiple
        },
        "tags": {
            "type": "array",
            "items": {"type": "string", "minLength": 10, "maxLength": 20},
        },
    },
    "required": ["score", "tags"],
    "additionalProperties": False,
    # patternProperties is another feature xgrammar does not support.
    "patternProperties": {
        "^score$": {"type": "integer"},
    },
}

# Closed set of answers for the `choice` structured-output mode.
SAMPLE_STRUCTURED_OUTPUTS_CHOICES = [
    "Python",
    "Java",
    "JavaScript",
    "C++",
    "C#",
    "PHP",
    "TypeScript",
    "Ruby",
    "Swift",
    "Kotlin",
]

# Tiny SQL grammar in EBNF form; whitespace between tokens is deliberately
# omitted so the generated statement can be compared with spaces stripped.
SAMPLE_SQL_EBNF = """
root ::= select_statement
select_statement ::= "SELECT" column "from" table "where" condition
column ::= "col_1" | "col_2"
table ::= "table_1" | "table_2"
condition ::= column "=" number
number ::= "1" | "2"
"""

# Same tiny SQL grammar expressed in Lark syntax.
SAMPLE_SQL_LARK = """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""

# Speculative-decoding configurations exercised by the parametrized tests.
NGRAM_SPEC_CONFIG = {
    "model": "[ngram]",
    "num_speculative_tokens": 5,
    "prompt_lookup_max": 5,
    "prompt_lookup_min": 1,
}

EAGLE_SPEC_CONFIG = {
    "method": "eagle",
    "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
    "num_speculative_tokens": 5,
}
2025-03-30 05:20:19 +02:00
# Tuples of (model_name, backend, tokenizer_mode, speculative_config) used to
# parametrize test_structured_output below.
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None),
    # FIXME: Since "auto" will use Mistral tokenizer and these backends do not support
    # it, we skip these tests for now.
    # ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None),
    # ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None),
    ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", None),
    pytest.param(
        "mistralai/Ministral-8B-Instruct-2410",
        "lm-format-enforcer",
        "hf",
        None,
        marks=pytest.mark.skip(
            reason=(
                "Flaky: lm-format-enforcer intermittently returns "
                "incomplete JSON. "
                "See https://github.com/noamgat/lm-format-enforcer/issues/169"
            )
        ),
    ),
    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None),
    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None),
    pytest.param(
        "Qwen/Qwen2.5-1.5B-Instruct",
        "lm-format-enforcer",
        "auto",
        None,
        marks=pytest.mark.skip(
            reason=(
                "Flaky: lm-format-enforcer intermittently returns "
                "incomplete JSON. "
                "See https://github.com/noamgat/lm-format-enforcer/issues/169"
            )
        ),
    ),
    # FIXME: This tests are flaky on CI thus disabled. Tracking in Issue #24402
    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
    # ("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG),
    ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", NGRAM_SPEC_CONFIG),
    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG),
    ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", EAGLE_SPEC_CONFIG),
]

# Tuples of (model_name, tokenizer_mode) for test_structured_output_auto_mode.
PARAMS_MODELS_TOKENIZER_MODE = [
    ("mistralai/Ministral-8B-Instruct-2410", "auto"),
    ("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
]

# Platform-specific extra kwargs forwarded to LLM(); async scheduling is
# disabled on ROCm.
platform_args = {}
if current_platform.is_rocm():
    platform_args["async_scheduling"] = False
2025-03-11 22:40:09 -04:00
2025-03-29 00:10:45 -04:00
class CarType(str, Enum):
    """Closed set of car body styles; member values are the exact strings
    allowed in generated JSON (note the mixed casing is intentional)."""

    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"
class CarDescription(BaseModel):
    """Pydantic model whose ``model_json_schema()`` (embedding the CarType
    enum) is used as a structured-output JSON schema in the tests."""

    # Free-form strings for brand and model; car_type is restricted to
    # the CarType enum values.
    brand: str
    model: str
    car_type: CarType
2025-04-29 17:02:10 -07:00
@pytest.mark.parametrize(
    "model_name, backend, tokenizer_mode, speculative_config",
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
)
def test_structured_output(
    backend: str,
    tokenizer_mode: str,
    model_name: str,
    speculative_config: dict[str, Any],
):
    """End-to-end structured-output coverage for one (model, backend,
    tokenizer_mode, speculative_config) combination.

    Runs up to eleven scenarios against a single shared LLM instance:
    JSON-schema generation, schemaless json_object, xgrammar-unsupported
    schema handling, EBNF and Lark grammars, invalid grammar rejection,
    regex, choice, Pydantic-derived schema, min/max string length, and
    structural_tag. Some scenarios are skipped for backends that do not
    support them.
    """
    sample_json_schema = SAMPLE_JSON_SCHEMA
    unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
    sample_sql_ebnf = SAMPLE_SQL_EBNF
    sample_sql_lark = SAMPLE_SQL_LARK
    sample_regex = SAMPLE_REGEX
    sample_structured_outputs_choices = SAMPLE_STRUCTURED_OUTPUTS_CHOICES

    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
    llm = LLM(
        model=model_name,
        enforce_eager=True,
        max_model_len=1024,
        structured_outputs_config=dict(
            backend=backend, disable_any_whitespace=backend in {"xgrammar", "guidance"}
        ),
        seed=120,
        tokenizer_mode=tokenizer_mode,
        # Mistral models need the "hf" load/config format here.
        load_format="auto" if not model_name.startswith("mistralai/") else "hf",
        config_format="auto" if not model_name.startswith("mistralai/") else "hf",
        speculative_config=speculative_config,
        **platform_args,
    )

    #
    # Test 1: Generate JSON output based on a provided schema
    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
        structured_outputs=StructuredOutputsParams(json=sample_json_schema),
    )
    prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}"
    )
    outputs = llm.generate(
        [prompt] * 2,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        if backend != "lm-format-enforcer":
            # disable_any_whitespace was requested above, so no newlines
            # should appear in the output for the other backends.
            assert "\n" not in generated_text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        try:
            output_json = json.loads(generated_text)
        except json.JSONDecodeError as e:
            pytest.fail(
                f"Invalid JSON from backend={backend}: {generated_text!r}\n"
                f"Schema: {sample_json_schema}\nError: {e}"
            )
        jsonschema.validate(instance=output_json, schema=sample_json_schema)

    #
    # Test 2: Generate JSON object without a schema
    #
    if backend != "outlines":
        sampling_params = SamplingParams(
            temperature=1.0,
            max_tokens=4096,
            n=2,
            structured_outputs=StructuredOutputsParams(json_object=True),
        )

        outputs = llm.generate(
            prompts=(
                "Generate a JSON object with curly braces for a person with "
                "name and age fields for John Smith who is 31 years old. "
                "Make the response as short as possible."
            ),
            sampling_params=sampling_params,
            use_tqdm=True,
        )

        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)

            # n=2 above, so check both completions.
            for i in range(2):
                generated_text = output.outputs[i].text
                print(generated_text)
                assert generated_text is not None

                # Parse to verify it is a valid JSON object
                parsed_json = json.loads(generated_text)
                assert isinstance(parsed_json, dict)

    #
    # Test 3: test a jsonschema incompatible with xgrammar
    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
        structured_outputs=StructuredOutputsParams(json=unsupported_json_schema),
    )
    if backend.startswith("xgrammar"):
        # xgrammar must reject the schema up front rather than silently
        # producing unconstrained output.
        with pytest.raises(
            ValueError,
            match="The provided JSON schema contains features "
            "not supported by xgrammar.",
        ):
            prompt = (
                f"Give an example JSON for an employee profile that "
                f"fits this schema: {unsupported_json_schema}. "
                f"Make the response as short as possible."
            )
            llm.generate(
                [prompt] * 2,
                sampling_params=sampling_params,
                use_tqdm=True,
            )
    else:
        prompt = (
            f"Give an example JSON object for a grade that "
            f"fits this schema: {unsupported_json_schema}. "
            f"Make the response as short as possible."
        )
        outputs = llm.generate(
            prompt,
            sampling_params=sampling_params,
            use_tqdm=True,
        )
        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            generated_text = output.outputs[0].text
            assert generated_text is not None
            print(generated_text)
            # Parse to verify it is valid JSON
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)

    # Grammar-based scenarios are only supported by xgrammar/guidance.
    if backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 4: Generate SQL statement using EBNF grammar
        #
        sampling_params = SamplingParams(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
            structured_outputs=StructuredOutputsParams(grammar=sample_sql_ebnf),
        )
        outputs = llm.generate(
            (
                "Generate a sql statement that selects col_1 from "
                "table_1 where it is equal to 1. Make the response as short as "
                "possible."
            ),
            sampling_params=sampling_params,
            use_tqdm=True,
        )

        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            prompt = output.prompt

            generated_text = output.outputs[0].text
            assert generated_text is not None

            # remove spaces for comparison b/c we removed them in the grammar
            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

            assert generated_text.strip() == ground_truth

            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        #
        # Test 5: Generate SQL statement using Lark grammar
        #
        sampling_params = SamplingParams(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
            structured_outputs=StructuredOutputsParams(grammar=sample_sql_lark),
        )
        outputs = llm.generate(
            (
                "Generate a sql statement that selects col_1 from "
                "table_1 where it is equal to 1. Make the response as short as "
                "possible."
            ),
            sampling_params=sampling_params,
            use_tqdm=True,
        )

        assert outputs is not None
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            prompt = output.prompt

            generated_text = output.outputs[0].text
            assert generated_text is not None

            # use Lark to parse the output, and make sure it's a valid parse tree
            from lark import Lark

            parser = Lark(sample_sql_lark)
            parser.parse(generated_text)

            # remove spaces for comparison b/c we removed them in the grammar
            ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

            assert generated_text.strip() == ground_truth

            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        #
        # Test 6: Test invalid grammar input
        #
        sampling_params = SamplingParams(
            temperature=0.8,
            top_p=0.95,
            max_tokens=1000,
            structured_outputs=StructuredOutputsParams(grammar="not a grammar"),
        )
        with pytest.raises(ValueError, match="Failed to convert the grammar"):
            llm.generate(
                (
                    "Generate a sql statement that selects col_1 from "
                    "table_1 where it is equal to 1. Make the response as short "
                    "as possible."
                ),
                sampling_params=sampling_params,
                use_tqdm=True,
            )

    #
    # Test 7: Generate text based on a regex pattern
    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        structured_outputs=StructuredOutputsParams(regex=sample_regex),
    )
    prompt = (
        f"Give an example IPv4 address with this regex: {sample_regex}. "
        f"Make the response as short as possible."
    )
    outputs = llm.generate(
        [prompt] * 2,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None
        # The full completion (not just a substring) must match the pattern.
        assert re.fullmatch(sample_regex, generated_text) is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    #
    # Test 8: Generate text based on a choices
    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        structured_outputs=StructuredOutputsParams(
            choice=sample_structured_outputs_choices
        ),
    )

    outputs = llm.generate(
        (
            "The best language for type-safe systems programming is "
            "(Make the response as short as possible.) "
        ),
        sampling_params=sampling_params,
        use_tqdm=True,
    )
    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None
        assert generated_text in sample_structured_outputs_choices
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    #
    # Test 9: Generate structured output using a Pydantic model with an enum
    #
    json_schema = CarDescription.model_json_schema()
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        structured_outputs=StructuredOutputsParams(json=json_schema),
    )
    outputs = llm.generate(
        (
            "Generate a JSON with the brand, model and car_type of the most "
            "iconic car from the 90's. Make the response as short as "
            "possible."
        ),
        sampling_params=sampling_params,
        use_tqdm=True,
    )
    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        try:
            output_json = json.loads(generated_text)
        except json.JSONDecodeError as e:
            pytest.fail(
                f"Invalid JSON from backend={backend}: {generated_text!r}\n"
                f"Schema: {json_schema}\nError: {e}"
            )
        jsonschema.validate(instance=output_json, schema=json_schema)

    #
    # Test 10: Generate structured with minLength and maxLength
    #
    min_length = 50
    max_length = 50
    json_schema = {
        "type": "object",
        "properties": {
            "description": {
                "type": "string",
                "maxLength": max_length,
                "minLength": min_length,
            }
        },
        "required": ["description"],
        "additionalProperties": False,
    }
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=4096,
        structured_outputs=StructuredOutputsParams(json=json_schema),
    )

    outputs = llm.generate(
        (
            "Generate a description of a frog using 50 characters. "
            "Make the response as short as possible."
        ),
        sampling_params=sampling_params,
        use_tqdm=True,
    )
    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        try:
            output_json = json.loads(generated_text)
        except json.JSONDecodeError as e:
            pytest.fail(
                f"Invalid JSON from backend={backend}: {generated_text!r}\n"
                f"Schema: {json_schema}\nError: {e}"
            )
        jsonschema.validate(instance=output_json, schema=json_schema)

    if backend not in ["outlines", "lm-format-enforcer"]:
        #
        # Test 11: Generate structured output using structural_tag format
        #
        structural_tag_config = {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "additionalProperties": False,
                    },
                    "end": "</function>",
                }
            ],
            # Constrained decoding only kicks in once a trigger is emitted.
            "triggers": ["<function="],
        }

        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=4096,
            structured_outputs=StructuredOutputsParams(
                structural_tag=json.dumps(structural_tag_config)
            ),
        )

        prompt = """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name
as key and function argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City? \
Make the response as short as possible.
"""

        # Change this once other backends support structural_tag
        outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
        assert outputs is not None

        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
            generated_text = output.outputs[0].text
            assert generated_text is not None

            # Search for function call pattern in the response
            function_call_pattern = r"<function=get_weather>(.*?)</function>"
            matches = re.findall(function_call_pattern, generated_text)
            if not matches:
                # The model is not forced to call the function; only validate
                # the payload when a call is actually present.
                print(
                    f"Warning: No function calls found in response: {generated_text!r}"
                )
                continue
            # Take the first function call if multiple are found
            json_str = matches[0]
            try:
                json_content = json.loads(json_str)
                assert "city" in json_content
                assert isinstance(json_content["city"], str)
                print(f"Found valid function call: {generated_text!r}")
            except (json.JSONDecodeError, AssertionError) as e:
                pytest.fail(
                    f"Invalid function call format: {generated_text!r}\nError: {str(e)}"
                )
2025-04-26 10:06:37 -04:00
2025-03-29 00:10:45 -04:00
2025-05-14 18:45:24 -04:00
@pytest.mark.parametrize(
    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config, async_scheduling",  # noqa: E501
    [
        (
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
            "xgrammar",
            "auto",
            "deepseek_r1",
            NGRAM_SPEC_CONFIG,
            False,
        ),
        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False),
        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True),
    ],
)
def test_structured_output_with_reasoning_matrices(
    backend: str,
    tokenizer_mode: str,
    reasoning_parser: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
    async_scheduling: bool,
):
    """Check that structured output coexists with reasoning models: the
    free-form reasoning section is extracted by the reasoning parser and the
    remaining content must satisfy the JSON schema constraint."""
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
    llm = LLM(
        model=model_name,
        # Don't use eager execution on TPUs because we want to test for no
        # recompilation at runtime
        enforce_eager=bool(not current_platform.is_tpu()),
        max_model_len=1024,
        max_num_seqs=16,
        structured_outputs_config=dict(
            backend=backend,
            disable_any_whitespace=backend in {"xgrammar", "guidance"},
            reasoning_parser=reasoning_parser,
        ),
        tokenizer_mode=tokenizer_mode,
        speculative_config=speculative_config,
        async_scheduling=async_scheduling,
    )
    tokenizer = llm.get_tokenizer()
    # Instantiate the parser class registered under `reasoning_parser`.
    reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
        tokenizer=tokenizer
    )

    reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as JSON object with a single key 'result'. Make sure to correct your reasoning if there are any issue should it arise.\nProblem: What is 5 * 8 + 2?"  # noqa: E501
    reasoning_schema = {
        "type": "object",
        "properties": {"result": {"type": "integer"}},
        "required": ["result"],
        "additionalProperties": False,
    }
    if "Qwen3" in model_name:
        # Qwen3 expects an explicit opening think tag in the prompt.
        reasoning_prompt += "<think>\n"

    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=8192,
        structured_outputs=StructuredOutputsParams(json=reasoning_schema),
    )
    outputs = llm.generate(
        [reasoning_prompt],
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    output = outputs[0]

    assert output is not None and isinstance(output, RequestOutput)
    prompt = output.prompt
    generated_text = output.outputs[0].text
    # Split the completion into (reasoning, content) using the parser.
    reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
    print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")

    if "Qwen3" in model_name:
        assert content is not None
        assert reasoning is not None
    if content is not None:
        output_json = json.loads(content)
        jsonschema.validate(instance=output_json, schema=reasoning_schema)
2025-05-14 18:45:24 -04:00
2025-03-30 05:20:19 +02:00
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode(
    model_name: str,
    tokenizer_mode: str,
):
    """With backend="auto", a schema that xgrammar cannot handle must still
    work via automatic backend fallback, and the same SamplingParams object
    must remain reusable across generate() calls."""
    unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA

    llm = LLM(
        model=model_name,
        max_model_len=1024,
        structured_outputs_config=dict(backend="auto"),
        tokenizer_mode=tokenizer_mode,
        load_format="auto",
        config_format="auto",
    )

    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        structured_outputs=StructuredOutputsParams(json=unsupported_json_schema),
    )

    prompts = (
        "Give an example JSON object for a grade "
        "that fits this schema: "
        f"{unsupported_json_schema}. Make the response as short as possible."
    )

    # This would fail with the default of "xgrammar", but in "auto"
    # we will handle fallback automatically.
    #
    # The second call reuses the very same sampling_params to make sure the
    # `auto` backend handling doesn't mess it up.
    all_outputs = []
    for _ in range(2):
        all_outputs.extend(
            llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True)
        )

    assert all_outputs is not None
    for req_output in all_outputs:
        assert req_output is not None
        assert isinstance(req_output, RequestOutput)
        text = req_output.outputs[0].text
        assert text is not None
        print(text)
        # Parse to verify it is valid JSON
        assert isinstance(json.loads(text), dict)
2025-04-23 12:34:41 -06:00
2025-10-07 23:42:31 +08:00
def test_guidance_no_additional_properties():
    """The guidance backend with disable_additional_properties=True must emit
    only the keys declared in the schema, even when the prompt explicitly
    asks for many extra key-value pairs (a1..a20)."""
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        max_model_len=1024,
        structured_outputs_config=dict(
            backend="guidance",
            disable_any_whitespace=True,
            disable_additional_properties=True,
        ),
    )

    # Three declared string properties; everything else must be rejected.
    declared_keys = ["a1", "a2", "a3"]
    schema = {
        "type": "object",
        "properties": {key: {"type": "string"} for key in declared_keys},
        "required": declared_keys,
    }

    prompt = (
        "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a "
        "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a "
        "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20. "
        "Make the response as short as possible."
        "<|im_end|>\n<|im_start|>assistant\n"
    )

    def _run_backend(backend):
        # One generation pass with the given per-request backend; returns the
        # parsed, schema-validated JSON object.
        params = StructuredOutputsParams(
            json=schema,
            backend=backend,
            disable_any_whitespace=True,
            disable_additional_properties=True,
        )
        sampling = SamplingParams(
            temperature=0, max_tokens=256, structured_outputs=params
        )

        results = llm.generate(prompt, sampling_params=sampling)
        assert results is not None
        text = results[0].outputs[0].text
        assert text is not None
        parsed = json.loads(text)
        assert isinstance(parsed, dict)
        jsonschema.validate(instance=parsed, schema=schema)
        return parsed

    generated = _run_backend("guidance")
    # Declared keys are present; undeclared ones were suppressed.
    for key in ("a1", "a2", "a3"):
        assert key in generated
    for key in ("a4", "a5", "a6"):
        assert key not in generated
2025-08-22 10:31:24 -07:00
2025-09-18 05:20:27 -04:00
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests(
    backend: str,
):
    """Batch one structured-output request with one free-form request and
    verify each is decoded under its own per-request sampling params: the
    first must emit schema-valid compact JSON, the second must not be
    constrained (and should answer the factual question).
    """
    sample_json_schema = SAMPLE_JSON_SCHEMA

    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = not current_platform.is_tpu()
    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        enforce_eager=enforce_eager,
        max_model_len=1024,
        structured_outputs_config=StructuredOutputsConfig(
            backend=backend,
            disable_any_whitespace=backend in {"xgrammar", "guidance"},
        ),
    )

    structured_outputs_prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}"
    )
    non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "

    prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
    sampling_params = [
        SamplingParams(
            temperature=1.0,
            max_tokens=400,
            structured_outputs=StructuredOutputsParams(json=sample_json_schema),
        ),
        # No max tokens, temp=0 to assert on contents
        SamplingParams(
            seed=42,
            temperature=0,
            top_p=1.0,
        ),
    ]

    outputs = llm.generate(
        prompts=prompts, sampling_params=sampling_params, use_tqdm=True
    )
    assert outputs is not None

    # Free memory as soon as possible as failed assertions
    # will short circuit and not free up memory
    del llm
    torch.accelerator.empty_cache()
    cleanup_dist_env_and_memory()

    for index, output in enumerate(outputs):
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")

        is_structured_request = index == 0
        if is_structured_request:
            # First prompt is structured outputs, expect valid JSON
            # (disable_any_whitespace means no newlines in the output).
            assert "\n" not in generated_text
            output_json = json.loads(generated_text)
            jsonschema.validate(instance=output_json, schema=sample_json_schema)
        else:
            # Second prompt is not structured outputs, expect valid output
            # Cannot assert on exact output, but we can expect it to be factual
            assert "12,742" in generated_text
            # non-structured outputs requests should not return a valid JSON
            # here (json.JSONDecodeError is a ValueError subclass).
            with pytest.raises(ValueError):
                output_json = json.loads(generated_text)
2025-10-17 21:55:54 -07:00
2025-11-25 05:24:05 +00:00
@pytest.mark.parametrize("backend", ["xgrammar"])
def test_structured_output_with_structural_tag(backend: str):
    """Exercise the `structural_tag` constraint: whenever the trigger text
    ("hello") appears, the grammar must wrap free text between the configured
    begin/end markers, so "hello_flag" has to show up in the output.
    """
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        structured_outputs_config=StructuredOutputsConfig(backend=backend),
    )

    # Triggered-tag grammar: on seeing "hello", emit "hello_flag",
    # then any text, then the closing "hello"; may fire repeatedly.
    structural_tag_config = {
        "type": "structural_tag",
        "format": {
            "type": "triggered_tags",
            "tags": [
                {"begin": "hello_flag", "content": {"type": "any_text"}, "end": "hello"}
            ],
            "triggers": ["hello"],
            "stop_after_first": False,
        },
    }

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=500,
        structured_outputs=StructuredOutputsParams(
            structural_tag=json.dumps(structural_tag_config)
        ),
    )

    prompt = "Hello and repeat hello 10 times, do not say anything else. Only say hello hello hello, now start "
    outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        assert "hello_flag" in generated_text, (
            f"Expected 'hello_flag' to be in generated text, but got: {generated_text}"
        )