# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for chat-template loading, resolution, kwarg filtering, content-format
detection, and prompt generation via ``safe_apply_chat_template``."""

import pytest

from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.renderers.hf import (
    _get_hf_base_chat_template_params,
    _try_extract_ast,
    resolve_chat_template,
    resolve_chat_template_content_format,
    resolve_chat_template_kwargs,
    safe_apply_chat_template,
)
from vllm.tokenizers import get_tokenizer

from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH

EXAMPLES_DIR = VLLM_PATH / "examples"

chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATION_OUTPUT = [
    (
        "facebook/opt-125m",
        chatml_jinja_path,
        True,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
""",
    ),
    (
        "facebook/opt-125m",
        chatml_jinja_path,
        False,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""",
    ),
    (
        "facebook/opt-125m",
        chatml_jinja_path,
        False,
        True,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of""",
    ),
]

TEST_MESSAGES = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
    {"role": "user", "content": "What is the capital of"},
]
ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}


def test_load_chat_template():
    """Loading the ChatML example template returns its exact file content."""
    # Testing chatml template
    template_content = load_chat_template(chat_template=chatml_jinja_path)

    # Test assertions
    assert template_content is not None
    # Hard coded value for template_chatml.jinja
    assert (
        template_content
        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
    )


def test_no_load_chat_template_filelike():
    """A path-like template string that does not exist raises ValueError."""
    # Testing chatml template
    template = "../../examples/does_not_exist"

    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=template)


def test_no_load_chat_template_literallike():
    """A literal Jinja string is passed through unchanged."""
    # Testing chatml template
    template = "{{ messages }}"

    template_content = load_chat_template(chat_template=template)
    assert template_content == template


@pytest.mark.parametrize(
    "model",
    [
        "Qwen/Qwen2-VL-2B-Instruct",  # chat_template is of type str
        "NousResearch/Hermes-3-Llama-3.1-8B",  # chat_template is of type dict
    ],
)
@pytest.mark.parametrize("use_tools", [True, False])
def test_resolve_chat_template(sample_json_schema, model, use_tools):
    """Checks that the tokenizer's chat template resolves to a string,
    whether the HF config stores it as a str or a dict."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    tools = (
        [
            {
                "type": "function",
                "function": {
                    "name": "dummy_function_name",
                    "description": "This is a dummy function",
                    "parameters": sample_json_schema,
                },
            }
        ]
        if use_tools
        else None
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)


@pytest.mark.parametrize(
    "model, expected_kwargs",
    [
        (
            "Qwen/Qwen2-VL-2B-Instruct",
            {
                "add_vision_id",
                "add_generation_prompt",
                "continue_final_message",
                "tools",
            },
        ),
        (
            "Qwen/Qwen3-8B",
            {
                "enable_thinking",
                "add_generation_prompt",
                "continue_final_message",
                "tools",
            },
        ),
    ],
)
def test_resolve_chat_template_kwargs(sample_json_schema, model, expected_kwargs):
    """Checks that only kwargs actually accepted by the model's chat template
    (plus the HF base parameters) survive kwarg resolution, and that reserved
    keys like ``chat_template``/``tokenize`` are rejected."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    tools = [
        {
            "type": "function",
            "function": {
                "name": "dummy_function_name",
                "description": "This is a dummy function",
                "parameters": sample_json_schema,
            },
        }
    ]

    chat_template_kwargs = {
        # both unused
        "unsed_kwargs_1": 123,
        "unsed_kwargs_2": "abc",
        # should not appear
        "chat_template": "{% Hello world! %}",
        "tokenize": True,
        # used by tokenizer
        "continue_final_message": True,
        "tools": tools,
        # both used by Qwen2-VL and Qwen3
        "add_generation_prompt": True,
        # only used by Qwen2-VL
        "add_vision_id": True,
        # only used by Qwen3
        "enable_thinking": True,
    }

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )

    with pytest.raises(
        ValueError, match="Found unexpected chat template kwargs from request"
    ):
        # should raise error if `chat_template_kwargs` contains
        # `chat_template` or `tokenize`
        resolve_chat_template_kwargs(
            tokenizer,
            chat_template=chat_template,
            chat_template_kwargs=chat_template_kwargs,
        )

    resolved_chat_template_kwargs = resolve_chat_template_kwargs(
        tokenizer,
        chat_template=chat_template,
        chat_template_kwargs=chat_template_kwargs,
        raise_on_unexpected=False,
    )
    assert set(resolved_chat_template_kwargs.keys()) == expected_kwargs

    # Additional test: Verify HF base parameters work with **kwargs tokenizers
    # This validates the fix for tokenizers like Kimi K2 that use **kwargs
    # to receive standard HuggingFace parameters instead of declaring them explicitly
    hf_base_params = _get_hf_base_chat_template_params()

    # Verify common HF parameters are in the base class
    assert {"add_generation_prompt", "tools", "continue_final_message"}.issubset(
        hf_base_params
    ), f"Expected HF base params not found in {hf_base_params}"

    # Test with a mock tokenizer that uses **kwargs (like Kimi K2)
    class MockTokenizerWithKwargs:
        def apply_chat_template(self, conversation, **kwargs):
            return "mocked_output"

    mock_tokenizer = MockTokenizerWithKwargs()
    mock_kwargs = {
        "add_generation_prompt": True,
        "tools": tools,
        "continue_final_message": False,
        "unknown_param": "should_be_filtered",
    }

    resolved_mock = resolve_chat_template_kwargs(
        mock_tokenizer, chat_template, mock_kwargs, raise_on_unexpected=False
    )

    # HF base params should pass through even with **kwargs tokenizer
    assert "add_generation_prompt" in resolved_mock
    assert "tools" in resolved_mock
    assert "continue_final_message" in resolved_mock
    # Unknown params should be filtered out
    assert "unknown_param" not in resolved_mock


# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
@pytest.mark.parametrize(
    ("model", "expected_format"),
    [
        ("microsoft/Phi-3.5-vision-instruct", "string"),
        ("Qwen/Qwen2-VL-2B-Instruct", "openai"),
        ("Qwen/Qwen2.5-VL-3B-Instruct", "openai"),
        ("fixie-ai/ultravox-v0_5-llama-3_2-1b", "string"),
        ("Qwen/Qwen2-Audio-7B-Instruct", "openai"),
        ("meta-llama/Llama-Guard-3-1B", "openai"),
    ],
)
def test_resolve_content_format_hf_defined(model, expected_format):
    """Content format is detected correctly from HF-defined chat templates."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)

    print("[TEXT]")
    print(chat_template)
    print("[AST]")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
        None,
        "auto",
        tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    ("model", "expected_format"),
    [
        ("Salesforce/blip2-opt-2.7b", "string"),
        ("facebook/chameleon-7b", "string"),
        ("deepseek-ai/deepseek-vl2-tiny", "string"),
        ("adept/fuyu-8b", "string"),
        ("google/paligemma-3b-mix-224", "string"),
        ("Qwen/Qwen-VL", "string"),
        ("Qwen/Qwen-VL-Chat", "string"),
    ],
)
def test_resolve_content_format_fallbacks(model, expected_format):
    """Content format falls back to "string" for models without an
    OpenAI-style chat template."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    tokenizer = get_tokenizer(
        model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)

    print("[TEXT]")
    print(chat_template)
    print("[AST]")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
        None,
        "auto",
        tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    ("template_path", "expected_format"),
    [
        ("template_alpaca.jinja", "string"),
        ("template_baichuan.jinja", "string"),
        ("template_chatglm.jinja", "string"),
        ("template_chatglm2.jinja", "string"),
        ("template_chatml.jinja", "string"),
        ("template_falcon_180b.jinja", "string"),
        ("template_falcon.jinja", "string"),
        ("template_inkbot.jinja", "string"),
        ("template_teleflm.jinja", "string"),
        ("pooling/embed/template/dse_qwen2_vl.jinja", "openai"),
        ("pooling/embed/template/vlm2vec_phi3v.jinja", "openai"),
        ("pooling/embed/template/vlm2vec_qwen2vl.jinja", "openai"),
        ("tool_chat_template_granite_20b_fc.jinja", "string"),
        ("tool_chat_template_hermes.jinja", "string"),
        ("tool_chat_template_internlm2_tool.jinja", "string"),
        ("tool_chat_template_llama3.1_json.jinja", "openai"),
        ("tool_chat_template_llama3.2_json.jinja", "openai"),
        ("tool_chat_template_mistral_parallel.jinja", "string"),
        ("tool_chat_template_mistral.jinja", "string"),
    ],
)
def test_resolve_content_format_examples(template_path, expected_format):
    """Content format is detected correctly for the bundled example templates."""
    model = "Qwen/Qwen2-VL-2B-Instruct"  # Dummy

    model_config = ModelConfig(
        model,
        tokenizer=model,
        trust_remote_code=True,
    )

    dummy_tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )
    # Force detection to rely on the explicitly-loaded template, not the
    # tokenizer's own default.
    dummy_tokenizer.chat_template = None

    chat_template = load_chat_template(EXAMPLES_DIR / template_path)
    assert isinstance(chat_template, str)

    print("[TEXT]")
    print(chat_template)
    print("[AST]")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        chat_template,
        None,
        "auto",
        dummy_tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Applying the ChatML template yields the hard-coded expected prompt for
    each add_generation_prompt / continue_final_message combination."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        trust_remote_code=model_info.trust_remote_code,
        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Initialize the tokenizer
    tokenizer = get_tokenizer(
        tokenizer_name=model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    template_content = load_chat_template(chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
        if continue_final_message
        else TEST_MESSAGES,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    result = safe_apply_chat_template(
        model_config,
        tokenizer,
        mock_request.messages,
        tools=None,
        chat_template=mock_request.chat_template or template_content,
        add_generation_prompt=mock_request.add_generation_prompt,
        continue_final_message=mock_request.continue_final_message,
        tokenize=False,
    )

    # Test assertion
    assert result == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}"
    )