# 2026-01-22 20:44:22 +08:00
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm . config import ModelConfig
from vllm . entrypoints . chat_utils import load_chat_template
from vllm . entrypoints . openai . chat_completion . protocol import ChatCompletionRequest
from vllm . renderers . hf import (
_get_hf_base_chat_template_params ,
_try_extract_ast ,
resolve_chat_template ,
resolve_chat_template_content_format ,
resolve_chat_template_kwargs ,
safe_apply_chat_template ,
)
from vllm . tokenizers import get_tokenizer
from . . models . registry import HF_EXAMPLE_MODELS
from . . utils import VLLM_PATH
# Directory under VLLM_PATH against which example template paths are resolved.
EXAMPLES_DIR = VLLM_PATH / " examples "

# ChatML template file used as the reference template by tests in this module.
chatml_jinja_path = VLLM_PATH / " examples/template_chatml.jinja "

# Fail at import time if the reference template cannot be found.
assert chatml_jinja_path.exists()
# Models, templates, and the prompt text each combination is expected to yield.
# Every entry is laid out as:
#   (model, template, add_generation_prompt, continue_final_message,
#    expected_output)
MODEL_TEMPLATE_GENERATION_OUTPUT = [
    # add_generation_prompt=True, continue_final_message=False
    (
        " facebook/opt-125m ",
        chatml_jinja_path,
        True,
        False,
        """ <|im_start|>user
Hello < | im_end | >
< | im_start | > assistant
Hi there ! < | im_end | >
< | im_start | > user
What is the capital of < | im_end | >
< | im_start | > assistant
""",
    ),
    # add_generation_prompt=False, continue_final_message=False
    (
        " facebook/opt-125m ",
        chatml_jinja_path,
        False,
        False,
        """ <|im_start|>user
Hello < | im_end | >
< | im_start | > assistant
Hi there ! < | im_end | >
< | im_start | > user
What is the capital of """,
    ),
    # add_generation_prompt=False, continue_final_message=True
    (
        " facebook/opt-125m ",
        chatml_jinja_path,
        False,
        True,
        """ <|im_start|>user
Hello < | im_end | >
< | im_start | > assistant
Hi there ! < | im_end | >
< | im_start | > user
What is the capital of < | im_end | >
< | im_start | > assistant
The capital of """,
    ),
]
# Three-turn conversation shared by the prompt-generation test; the final user
# turn is an unfinished question.
TEST_MESSAGES = [
    {" role ": speaker, " content ": text}
    for speaker, text in (
        (" user ", " Hello "),
        (" assistant ", " Hi there! "),
        (" user ", " What is the capital of "),
    )
]

# Partial assistant turn that is appended to TEST_MESSAGES when the final
# message is to be continued.
ASSISTANT_MESSAGE_TO_CONTINUE = {
    " role ": " assistant ",
    " content ": " The capital of ",
}
def test_load_chat_template():
    """Load the ChatML template file and compare it with a hard-coded copy."""
    # Hard coded value for template_chatml.jinja
    expected_template = """ { % f or message in messages % } {{ ' <|im_start|> ' + message[ ' role ' ] + ' \\ n ' + message[ ' content ' ]}} { % i f (loop.last and add_generation_prompt) or not loop.last % } {{ ' <|im_end|> ' + ' \\ n ' }} { % e ndif % } { % e ndfor % }
{ % if add_generation_prompt and messages [ - 1 ] [ ' role ' ] != ' assistant ' % } { { ' <|im_start|>assistant \\ n ' } } { % endif % } """  # noqa: E501

    loaded_template = load_chat_template(chat_template=chatml_jinja_path)

    # The loader must return something, and it must match the expected text.
    assert loaded_template is not None
    assert loaded_template == expected_template
def test_no_load_chat_template_filelike():
    """Expect a ValueError for a path-like template that does not exist."""
    missing_template = " ../../examples/does_not_exist "
    expect_path_error = pytest.raises(
        ValueError, match=" looks like a file path "
    )
    with expect_path_error:
        load_chat_template(chat_template=missing_template)
def test_no_load_chat_template_literallike():
    """Expect an inline template string to come back from the loader unchanged."""
    inline_template = " {{ messages }} "
    assert load_chat_template(chat_template=inline_template) == inline_template
@pytest.mark.parametrize(
    " model ",
    [
        " Qwen/Qwen2-VL-2B-Instruct ",  # chat_template is of type str
        " NousResearch/Hermes-3-Llama-3.1-8B ",  # chat_template is of type dict
    ],
)
@pytest.mark.parametrize(" use_tools ", [True, False])
def test_resolve_chat_template(sample_json_schema, model, use_tools):
    """Check that the tokenizer's chat template resolves to a string.

    No explicit template is passed to ``resolve_chat_template``; the check is
    run both with a dummy tool definition and with no tools at all.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail=" skip ")

    config_kwargs = dict(
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=hf_info.require_embed_inputs,
        enable_prompt_embeds=hf_info.require_embed_inputs,
        enable_mm_embeds=hf_info.require_embed_inputs,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )
    config = ModelConfig(model, **config_kwargs)

    # Build the tokenizer
    hf_tokenizer = get_tokenizer(
        model,
        trust_remote_code=config.trust_remote_code,
    )

    # A single dummy function tool, or no tools at all.
    tools = None
    if use_tools:
        dummy_function = {
            " name ": " dummy_function_name ",
            " description ": " This is a dummy function ",
            " parameters ": sample_json_schema,
        }
        tools = [{" type ": " function ", " function ": dummy_function}]

    # Test detecting the tokenizer's chat_template
    resolved_template = resolve_chat_template(
        hf_tokenizer,
        chat_template=None,
        tools=tools,
        model_config=config,
    )
    assert isinstance(resolved_template, str)
@pytest.mark.parametrize(
    " model, expected_kwargs ",
    [
        (
            " Qwen/Qwen2-VL-2B-Instruct ",
            {
                " add_vision_id ",
                " add_generation_prompt ",
                " continue_final_message ",
                " tools ",
            },
        ),
        (
            " Qwen/Qwen3-8B ",
            {
                " enable_thinking ",
                " add_generation_prompt ",
                " continue_final_message ",
                " tools ",
            },
        ),
    ],
)
def test_resolve_chat_template_kwargs(sample_json_schema, model, expected_kwargs):
    """Check which request kwargs ``resolve_chat_template_kwargs`` keeps.

    Three things are verified:

    * a ``ValueError`` is raised when the request kwargs contain
      ``chat_template`` or ``tokenize`` and the call uses its default settings;
    * with ``raise_on_unexpected=False`` the resolved keys equal
      ``expected_kwargs`` for the given model;
    * for a tokenizer whose ``apply_chat_template`` only declares ``**kwargs``,
      the HF base parameters are kept and an unknown parameter is dropped.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail=" skip ")

    dummy_function = {
        " name ": " dummy_function_name ",
        " description ": " This is a dummy function ",
        " parameters ": sample_json_schema,
    }
    tools = [{" type ": " function ", " function ": dummy_function}]

    request_kwargs = {
        # both unused
        " unsed_kwargs_1 ": 123,
        " unsed_kwargs_2 ": " abc ",
        # should not appear
        " chat_template ": " { % Hello world! % } ",
        " tokenize ": True,
        # used by tokenizer
        " continue_final_message ": True,
        " tools ": tools,
        # both used by Qwen2-VL and Qwen3
        " add_generation_prompt ": True,
        # only used by Qwen2-VL
        " add_vision_id ": True,
        # only used by Qwen3
        " enable_thinking ": True,
    }

    config_kwargs = dict(
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=hf_info.require_embed_inputs,
        enable_prompt_embeds=hf_info.require_embed_inputs,
        enable_mm_embeds=hf_info.require_embed_inputs,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )
    config = ModelConfig(model, **config_kwargs)

    # Build the tokenizer
    hf_tokenizer = get_tokenizer(
        model,
        trust_remote_code=config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    resolved_template = resolve_chat_template(
        hf_tokenizer,
        chat_template=None,
        tools=tools,
        model_config=config,
    )

    # should raise error if `chat_template_kwargs` contains
    # `chat_template` or `tokenize`
    with pytest.raises(
        ValueError, match=" Found unexpected chat template kwargs from request "
    ):
        resolve_chat_template_kwargs(
            hf_tokenizer,
            chat_template=resolved_template,
            chat_template_kwargs=request_kwargs,
        )

    lenient_result = resolve_chat_template_kwargs(
        hf_tokenizer,
        chat_template=resolved_template,
        chat_template_kwargs=request_kwargs,
        raise_on_unexpected=False,
    )
    assert set(lenient_result.keys()) == expected_kwargs

    # Additional test: Verify HF base parameters work with **kwargs tokenizers
    # This validates the fix for tokenizers like Kimi K2 that use **kwargs
    # to receive standard HuggingFace parameters instead of declaring them
    # explicitly
    hf_base_params = _get_hf_base_chat_template_params()

    # Verify common HF parameters are in the base class
    common_hf_params = {
        " add_generation_prompt ",
        " tools ",
        " continue_final_message ",
    }
    assert common_hf_params.issubset(
        hf_base_params
    ), f" Expected HF base params not found in { hf_base_params } "

    # Test with a mock tokenizer that uses **kwargs (like Kimi K2)
    class MockTokenizerWithKwargs:
        def apply_chat_template(self, conversation, **kwargs):
            return " mocked_output "

    kwargs_only_tokenizer = MockTokenizerWithKwargs()
    mock_request_kwargs = {
        " add_generation_prompt ": True,
        " tools ": tools,
        " continue_final_message ": False,
        " unknown_param ": " should_be_filtered ",
    }
    mock_result = resolve_chat_template_kwargs(
        kwargs_only_tokenizer,
        resolved_template,
        mock_request_kwargs,
        raise_on_unexpected=False,
    )

    # HF base params should pass through even with **kwargs tokenizer
    for kept_name in (
        " add_generation_prompt ",
        " tools ",
        " continue_final_message ",
    ):
        assert kept_name in mock_result

    # Unknown params should be filtered out
    assert " unknown_param " not in mock_result
# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
@pytest.mark.parametrize(
    (" model ", " expected_format "),
    [
        (" microsoft/Phi-3.5-vision-instruct ", " string "),
        (" Qwen/Qwen2-VL-2B-Instruct ", " openai "),
        (" Qwen/Qwen2.5-VL-3B-Instruct ", " openai "),
        (" fixie-ai/ultravox-v0_5-llama-3_2-1b ", " string "),
        (" Qwen/Qwen2-Audio-7B-Instruct ", " openai "),
        (" meta-llama/Llama-Guard-3-1B ", " openai "),
    ],
)
def test_resolve_content_format_hf_defined(model, expected_format):
    """Check the auto-detected content format for each listed model.

    The chat template is resolved without an explicit override, dumped together
    with its extracted AST for debugging, and the content format resolved in
    "auto" mode is compared with ``expected_format``.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail=" skip ")

    config_kwargs = dict(
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=hf_info.require_embed_inputs,
        enable_prompt_embeds=hf_info.require_embed_inputs,
        enable_mm_embeds=hf_info.require_embed_inputs,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )
    config = ModelConfig(model, **config_kwargs)

    hf_tokenizer = get_tokenizer(
        model,
        trust_remote_code=config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    resolved_template = resolve_chat_template(
        hf_tokenizer,
        chat_template=None,
        tools=None,
        model_config=config,
    )
    assert isinstance(resolved_template, str)

    # Dump the template text and its AST to help diagnose failures.
    print(" [TEXT] ")
    print(resolved_template)
    print(" [AST] ")
    print(_try_extract_ast(resolved_template))

    detected_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
        None,
        " auto ",
        hf_tokenizer,
        model_config=config,
    )
    assert detected_format == expected_format
@pytest.mark.parametrize(
    (" model ", " expected_format "),
    [
        (" Salesforce/blip2-opt-2.7b ", " string "),
        (" facebook/chameleon-7b ", " string "),
        (" deepseek-ai/deepseek-vl2-tiny ", " string "),
        (" adept/fuyu-8b ", " string "),
        (" google/paligemma-3b-mix-224 ", " string "),
        (" Qwen/Qwen-VL ", " string "),
        (" Qwen/Qwen-VL-Chat ", " string "),
    ],
)
def test_resolve_content_format_fallbacks(model, expected_format):
    """Check the auto-detected content format for each listed model.

    The tokenizer is loaded from ``model_config.tokenizer``; the chat template
    is resolved without an explicit override, dumped with its extracted AST for
    debugging, and the content format resolved in "auto" mode is compared with
    ``expected_format``.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail=" skip ")

    config_kwargs = dict(
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=hf_info.require_embed_inputs,
        enable_prompt_embeds=hf_info.require_embed_inputs,
        enable_mm_embeds=hf_info.require_embed_inputs,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )
    config = ModelConfig(model, **config_kwargs)

    hf_tokenizer = get_tokenizer(
        config.tokenizer,
        trust_remote_code=config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    resolved_template = resolve_chat_template(
        hf_tokenizer,
        chat_template=None,
        tools=None,
        model_config=config,
    )
    assert isinstance(resolved_template, str)

    # Dump the template text and its AST to help diagnose failures.
    print(" [TEXT] ")
    print(resolved_template)
    print(" [AST] ")
    print(_try_extract_ast(resolved_template))

    detected_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
        None,
        " auto ",
        hf_tokenizer,
        model_config=config,
    )
    assert detected_format == expected_format
@pytest.mark.parametrize(
    (" template_path ", " expected_format "),
    [
        (" template_alpaca.jinja ", " string "),
        (" template_baichuan.jinja ", " string "),
        (" template_chatglm.jinja ", " string "),
        (" template_chatglm2.jinja ", " string "),
        (" template_chatml.jinja ", " string "),
        (" template_falcon_180b.jinja ", " string "),
        (" template_falcon.jinja ", " string "),
        (" template_inkbot.jinja ", " string "),
        (" template_teleflm.jinja ", " string "),
        (" pooling/embed/template/dse_qwen2_vl.jinja ", " openai "),
        (" pooling/embed/template/vlm2vec_phi3v.jinja ", " openai "),
        (" pooling/embed/template/vlm2vec_qwen2vl.jinja ", " openai "),
        (" tool_chat_template_granite_20b_fc.jinja ", " string "),
        (" tool_chat_template_hermes.jinja ", " string "),
        (" tool_chat_template_internlm2_tool.jinja ", " string "),
        (" tool_chat_template_llama3.1_json.jinja ", " openai "),
        (" tool_chat_template_llama3.2_json.jinja ", " openai "),
        (" tool_chat_template_mistral_parallel.jinja ", " string "),
        (" tool_chat_template_mistral.jinja ", " string "),
    ],
)
def test_resolve_content_format_examples(template_path, expected_format):
    """Check the auto-detected content format of each example template.

    Args:
        template_path: Template location relative to ``EXAMPLES_DIR``.
        expected_format: Content format the resolver is expected to report
            for that template when asked to detect it in "auto" mode.
    """
    model = " Qwen/Qwen2-VL-2B-Instruct "  # Dummy
    model_config = ModelConfig(
        model,
        tokenizer=model,
        trust_remote_code=True,
    )

    dummy_tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )
    # Clear the tokenizer's own template; presumably so that only the
    # explicitly passed example template is inspected (confirm against the
    # resolver's lookup order).
    dummy_tokenizer.chat_template = None

    chat_template = load_chat_template(EXAMPLES_DIR / template_path)
    assert isinstance(chat_template, str)

    # Dump the template text and its AST to help diagnose failures.
    print(" [TEXT] ")
    print(chat_template)
    print(" [AST] ")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        chat_template,
        None,
        " auto ",
        dummy_tokenizer,
        model_config=model_config,
    )
    assert resolved_format == expected_format
@pytest.mark.parametrize(
    " model,template,add_generation_prompt,continue_final_message,expected_output ",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Render the shared test conversation and compare with the expected prompt.

    The partial assistant message is appended to the conversation only when
    ``continue_final_message`` is set; the template is applied without
    tokenization so the result can be compared as text.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail=" skip ")

    config_kwargs = dict(
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        trust_remote_code=hf_info.trust_remote_code,
        revision=hf_info.revision,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=hf_info.require_embed_inputs,
        enable_prompt_embeds=hf_info.require_embed_inputs,
        enable_mm_embeds=hf_info.require_embed_inputs,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )
    config = ModelConfig(model, **config_kwargs)

    # Initialize the tokenizer
    hf_tokenizer = get_tokenizer(
        tokenizer_name=config.tokenizer,
        trust_remote_code=config.trust_remote_code,
    )
    loaded_template = load_chat_template(chat_template=template)

    # Pick the conversation: add the unfinished assistant turn only when the
    # final message is meant to be continued.
    if continue_final_message:
        conversation = TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
    else:
        conversation = TEST_MESSAGES

    # Create a mock request object using keyword arguments
    request = ChatCompletionRequest(
        model=model,
        messages=conversation,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    rendered_prompt = safe_apply_chat_template(
        config,
        hf_tokenizer,
        request.messages,
        tools=None,
        chat_template=request.chat_template or loaded_template,
        add_generation_prompt=request.add_generation_prompt,
        continue_final_message=request.continue_final_message,
        tokenize=False,
    )

    # Test assertion
    mismatch_message = (
        f" The generated prompt does not match the expected output for "
        f" model { model } and template { template } "
    )
    assert rendered_prompt == expected_output, mismatch_message