diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py index 5d79222a1..a0eaeb681 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/offline_inference/prompt_embed_inference.py @@ -38,8 +38,8 @@ def get_prompt_embeds( embedding_layer: torch.nn.Module, ): token_ids = tokenizer.apply_chat_template( - chat, add_generation_prompt=True, return_tensors="pt" - ) + chat, add_generation_prompt=True, return_tensors="pt", return_dict=True + ).input_ids prompt_embeds = embedding_layer(token_ids).squeeze(0) return prompt_embeds diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py index 889be6820..fab5dee35 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -49,8 +49,8 @@ def main(): # Refer to the HuggingFace repo for the correct format to use chat = [{"role": "user", "content": "Please tell me about the capital of France."}] token_ids = tokenizer.apply_chat_template( - chat, add_generation_prompt=True, return_tensors="pt" - ) + chat, add_generation_prompt=True, return_tensors="pt", return_dict=True + ).input_ids embedding_layer = transformers_model.get_input_embeddings() prompt_embeds = embedding_layer(token_ids).squeeze(0) diff --git a/examples/online_serving/token_generation_client.py b/examples/online_serving/token_generation_client.py index 88ee43c5d..836f54d50 100644 --- a/examples/online_serving/token_generation_client.py +++ b/examples/online_serving/token_generation_client.py @@ -27,7 +27,8 @@ def main(client): messages, add_generation_prompt=True, enable_thinking=False, - ) + return_dict=True, + ).input_ids payload = { "model": MODEL_NAME, "token_ids": token_ids, diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py index acbbaa659..215de8510 100644 --- a/tests/entrypoints/openai/test_serving_tokens.py +++ b/tests/entrypoints/openai/test_serving_tokens.py @@ -92,7 +92,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): messages, add_generation_prompt=True, enable_thinking=False, # default with Qwen3 - ) + return_dict=True, # default with Transformers v5 + ).input_ids for ignore_eos in [True, False]: payload = { @@ -155,7 +156,8 @@ async def test_stop_string_workflow(client, tokenizer, messages): messages, add_generation_prompt=True, enable_thinking=False, # default with Qwen3 - ) + return_dict=True, # default with Transformers v5 + ).input_ids payload = { "model": MODEL_NAME, "token_ids": token_ids, @@ -251,7 +253,8 @@ async def test_generate_with_lora_adapter(client, tokenizer, messages): messages, add_generation_prompt=True, enable_thinking=False, # default with Qwen3 - ) + return_dict=True, # default with Transformers v5 + ).input_ids payload = { "model": "Alice", "token_ids": token_ids, diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index c0e4a1932..f4a68f8f1 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -759,6 +759,7 @@ class IsaacProcessor: # Regular text message processed_messages.append(message) + kwargs["return_dict"] = False return self.tokenizer.apply_chat_template( processed_messages, tokenize=tokenize, diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py index 123911654..381618f62 100644 --- a/vllm/renderers/deepseek_v32.py +++ b/vllm/renderers/deepseek_v32.py @@ -70,6 +70,7 @@ class DeepseekV32Renderer(RendererLike): content_format="string", ) + kwargs["return_dict"] = False prompt_raw = tokenizer.apply_chat_template( conversation=conversation, messages=messages, @@ -100,6 +101,7 @@ class DeepseekV32Renderer(RendererLike): content_format="string", ) + kwargs["return_dict"] = False prompt_raw = tokenizer.apply_chat_template( conversation=conversation, messages=messages, diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py index 06de760f8..05709409c 100644 --- a/vllm/renderers/grok2.py +++ b/vllm/renderers/grok2.py @@ -70,6 +70,7 @@ class Grok2Renderer(RendererLike): content_format="string", ) + kwargs["return_dict"] = False prompt_raw = tokenizer.apply_chat_template( conversation=conversation, messages=messages, @@ -100,6 +101,7 @@ class Grok2Renderer(RendererLike): content_format="string", ) + kwargs["return_dict"] = False prompt_raw = tokenizer.apply_chat_template( conversation=conversation, messages=messages, diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index 252e6e753..c5a485ec1 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -465,6 +465,7 @@ def safe_apply_chat_template( chat_template=chat_template, chat_template_kwargs=kwargs, ) + resolved_kwargs["return_dict"] = False try: return tokenizer.apply_chat_template( diff --git a/vllm/tokenizers/grok2.py b/vllm/tokenizers/grok2.py index a4071908d..fe00f5e56 100644 --- a/vllm/tokenizers/grok2.py +++ b/vllm/tokenizers/grok2.py @@ -432,6 +432,7 @@ class Grok2Tokenizer(TokenizerLike): raise ValueError( "No chat template available. Provide `chat_template` explicitly." ) + kwargs["return_dict"] = False prompt = hf_chat_utils.apply_chat_template( conversation=messages, chat_template=template, diff --git a/vllm/transformers_utils/processors/hunyuan_vl.py b/vllm/transformers_utils/processors/hunyuan_vl.py index f32ce115c..528775777 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl.py +++ b/vllm/transformers_utils/processors/hunyuan_vl.py @@ -148,8 +148,8 @@ class HunYuanVLProcessor(ProcessorMixin): assert 0 def apply_chat_template(self, *args, **kwargs): - token_ids = self.tokenizer.apply_chat_template(*args, **kwargs) - return token_ids + kwargs["return_dict"] = False + return self.tokenizer.apply_chat_template(*args, **kwargs) def get_imgs_pos(self, doc_ids): doc_ids = np.array(doc_ids, dtype=np.int64) diff --git a/vllm/transformers_utils/processors/qwen3_asr.py b/vllm/transformers_utils/processors/qwen3_asr.py index 7fb30f8bb..677326e25 100644 --- a/vllm/transformers_utils/processors/qwen3_asr.py +++ b/vllm/transformers_utils/processors/qwen3_asr.py @@ -213,6 +213,7 @@ class Qwen3ASRProcessor(ProcessorMixin): return list(_iter()) def apply_chat_template(self, conversations, chat_template=None, **kwargs): + kwargs["return_dict"] = False return super().apply_chat_template(conversations, chat_template, **kwargs) @property