diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 665bed512..7f1a12f04 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -170,8 +170,6 @@ def run_test( @pytest.mark.parametrize( "size_factors", [ - # No image - [], # Single-scale [1.0], # Single-scale, batched diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index 8db69accf..6148c0bcd 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -375,7 +375,6 @@ def test_qwen2_vl_image_embeddings_input( @pytest.mark.parametrize( "size_factors", [ - [], # Single-scale [0.5], # Single-scale, batched diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index 70464cf7f..6b34262d3 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -100,8 +100,6 @@ def run_awq_test( @pytest.mark.parametrize( "size_factors", [ - # No image - [], # Single-scale [1.0], # Single-scale, batched diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4a6162cd2..b9147b99c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -73,7 +73,7 @@ from vllm.outputs import ( from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs -from vllm.renderers.inputs import DictPrompt, SingletonDictPrompt, TokPrompt +from vllm.renderers.inputs import DictPrompt, TokPrompt from vllm.renderers.inputs.preprocess import ( conversation_to_seq, extract_prompt_components, @@ -805,7 +805,7 @@ class LLM: self, prompts: Sequence[PromptType], tokenization_kwargs: dict[str, Any] | None = None, - ) -> list[DictPrompt | TokPrompt]: + ) -> Sequence[DictPrompt | TokPrompt]: """ Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into a format that can be passed to `_add_request`. @@ -819,22 +819,12 @@ class LLM: renderer = self.llm_engine.renderer model_config = self.model_config + parsed_prompts = [ + parse_model_prompt(model_config, prompt) for prompt in prompts + ] tok_params = self._get_cmpl_tok_params(tokenization_kwargs) - engine_prompts = list[DictPrompt | TokPrompt]() - for prompt in prompts: - parsed_prompt = parse_model_prompt(model_config, prompt) - in_prompt = renderer.render_prompt(parsed_prompt) - - # Some MM models have non-default `add_special_tokens` - # TODO: Move multi-modal processor into tokenization - engine_prompts.append( - in_prompt - if model_config.is_multimodal_model - else renderer.tokenize_prompt(in_prompt, tok_params) - ) - - return engine_prompts + return renderer.render_cmpl(parsed_prompts, tok_params) def _get_chat_tok_params(self, tokenization_kwargs: dict[str, Any] | None): model_config = self.model_config @@ -857,7 +847,7 @@ class LLM: tools: list[dict[str, Any]] | None = None, tokenization_kwargs: dict[str, Any] | None = None, mm_processor_kwargs: dict[str, Any] | None = None, - ) -> list[DictPrompt | TokPrompt]: + ) -> Sequence[TokPrompt]: """ Convert a list of conversations into prompts so that they can then be used as input for other LLM APIs. @@ -885,16 +875,12 @@ class LLM: ) tok_params = self._get_chat_tok_params(tokenization_kwargs) - engine_prompts = list[DictPrompt | TokPrompt]() - for conversation in conversations: - _, in_prompt = renderer.render_messages(conversation, chat_params) - if mm_processor_kwargs is not None: - target_prompt: SingletonDictPrompt = in_prompt.get( # type: ignore - "encoder_prompt", in_prompt - ) - target_prompt["mm_processor_kwargs"] = mm_processor_kwargs # type: ignore - - engine_prompts.append(renderer.tokenize_prompt(in_prompt, tok_params)) + _, engine_prompts = renderer.render_chat( + conversations, + chat_params, + tok_params, + prompt_extras={"mm_processor_kwargs": mm_processor_kwargs}, + ) return engine_prompts @@ -1743,7 +1729,7 @@ class LLM: # TODO: Remove this after deprecating `param.truncate_prompt_tokens` # Then, move the code from the `else` block to the top and let # `self._preprocess_completion` handle prompt normalization - engine_prompts = [ + engine_prompts: Sequence[DictPrompt | TokPrompt] = [ engine_prompt for prompt, param in zip(seq_prompts, seq_params) for engine_prompt in self._preprocess_completion( diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 2fabc5999..5ee5b531e 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -106,7 +106,6 @@ from vllm.pooling_params import PoolingParams from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs from vllm.renderers.inputs import TokPrompt from vllm.renderers.inputs.preprocess import ( - SingletonDictPrompt, extract_prompt_components, extract_prompt_len, parse_model_prompt, @@ -963,8 +962,6 @@ class OpenAIServing: renderer = self.renderer model_config = self.model_config - tok_params = request.build_tok_params(model_config) - prompts = list[SingletonPrompt | bytes]() if prompt_embeds is not None: # embeds take higher priority prompts.extend(prompt_to_seq(prompt_embeds)) @@ -979,22 +976,17 @@ class OpenAIServing: ) for prompt in prompts ] - in_prompts = await renderer.render_prompts_async(parsed_prompts) + tok_params = request.build_tok_params(model_config) - extra_items = { - k: v - for k in ("mm_processor_kwargs", "cache_salt") - if (v := getattr(request, k, None)) is not None - } - for in_prompt in in_prompts: - target_prompt: SingletonDictPrompt = in_prompt.get( # type: ignore - "encoder_prompt", in_prompt - ) - target_prompt.update(extra_items) # type: ignore - - engine_prompts = await renderer.tokenize_prompts_async(in_prompts, tok_params) - - return engine_prompts + return await renderer.render_cmpl_async( + parsed_prompts, + tok_params, + prompt_extras={ + k: v + for k in ("mm_processor_kwargs", "cache_salt") + if (v := getattr(request, k, None)) is not None + }, + ) async def _preprocess_chat( self, @@ -1023,21 +1015,16 @@ class OpenAIServing: default_template, default_template_content_format ).with_defaults(default_template_kwargs) - conversation, in_prompt = await renderer.render_messages_async( - messages, chat_params + (conversation,), (engine_prompt,) = await renderer.render_chat_async( + [messages], + chat_params, + tok_params, + prompt_extras={ + k: v + for k in ("mm_processor_kwargs", "cache_salt") + if (v := getattr(request, k, None)) is not None + }, ) - target_prompt: SingletonDictPrompt = in_prompt.get( # type: ignore - "encoder_prompt", in_prompt - ) - - extra_items = { - k: v - for k in ("mm_processor_kwargs", "cache_salt") - if (v := getattr(request, k, None)) is not None - } - target_prompt.update(extra_items) # type: ignore - - engine_prompt = await renderer.tokenize_prompt_async(target_prompt, tok_params) # tool parsing is done only if a tool_parser has been set and if # tool_choice is not "none" (if tool_choice is "none" but a tool_parser diff --git a/vllm/renderers/inputs/preprocess.py b/vllm/renderers/inputs/preprocess.py index eaac6aeb5..2ad38fed8 100644 --- a/vllm/renderers/inputs/preprocess.py +++ b/vllm/renderers/inputs/preprocess.py @@ -225,16 +225,20 @@ class PromptComponents(NamedTuple): embeds: "torch.Tensor | None" = None -def extract_prompt_components( - model_config: "ModelConfig", - prompt: object, -) -> PromptComponents: - target_prompt = ( +def extract_target_prompt(model_config: "ModelConfig", prompt: object): + return ( parse_enc_dec_prompt(prompt)["encoder_prompt"] if model_config.is_encoder_decoder else parse_dec_only_prompt(prompt) ) + +def extract_prompt_components( + model_config: "ModelConfig", + prompt: object, +) -> PromptComponents: + target_prompt = extract_target_prompt(model_config, prompt) + return PromptComponents( text=target_prompt.get("prompt"), token_ids=target_prompt.get("prompt_token_ids"), # type: ignore[arg-type] @@ -243,11 +247,7 @@ def extract_prompt_components( def extract_prompt_len(model_config: "ModelConfig", prompt: object): - target_prompt = ( - parse_enc_dec_prompt(prompt)["encoder_prompt"] - if model_config.is_encoder_decoder - else parse_dec_only_prompt(prompt) - ) + target_prompt = extract_target_prompt(model_config, prompt) return length_from_prompt_token_ids_or_embeds( target_prompt.get("prompt_token_ids"), # type: ignore[arg-type] diff --git a/vllm/renderers/protocol.py b/vllm/renderers/protocol.py index 5d84ac546..adf2ee552 100644 --- a/vllm/renderers/protocol.py +++ b/vllm/renderers/protocol.py @@ -16,6 +16,7 @@ from .inputs import ( EncoderDecoderTokPrompt, TokPrompt, ) +from .inputs.preprocess import extract_target_prompt from .params import ChatParams, TokenizeParams if TYPE_CHECKING: @@ -277,3 +278,109 @@ class BaseRenderer(ABC): return await asyncio.gather( *(self.tokenize_prompt_async(prompt, params) for prompt in prompts) ) + + # Step 3: Add extra keys to the prompts + def _apply_prompt_extras( + self, + prompts: Sequence[DictPrompt | TokPrompt], + prompt_extras: dict[str, Any] | None, + ): + if not prompt_extras: + return + + for prompt in prompts: + target_prompt = extract_target_prompt(self.config, prompt) + target_prompt.update(prompt_extras) # type: ignore[arg-type] + + # Top-level methods + def render_cmpl( + self, + prompts: Sequence[DictPrompt | bytes], + tok_params: TokenizeParams, + *, + prompt_extras: dict[str, Any] | None = None, + ): + dict_prompts = self.render_prompts(prompts) + + # NOTE: Some MM models have non-default `add_special_tokens` + # so we handle tokenization in multi-modal processor + if self.config.is_multimodal_model: + self._apply_prompt_extras(dict_prompts, prompt_extras) + return dict_prompts + + tok_prompts = self.tokenize_prompts(dict_prompts, tok_params) + + self._apply_prompt_extras(tok_prompts, prompt_extras) + + # TODO: Apply multi-modal processor + return tok_prompts + + async def render_cmpl_async( + self, + prompts: Sequence[DictPrompt | bytes], + tok_params: TokenizeParams, + *, + prompt_extras: dict[str, Any] | None = None, + ): + dict_prompts = await self.render_prompts_async(prompts) + + # NOTE: MM data cannot be passed to online Completions API + # so we don't have the special case that is in the offline version + tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params) + + self._apply_prompt_extras(tok_prompts, prompt_extras) + + # TODO: Apply multi-modal processor + return tok_prompts + + def render_chat( + self, + conversations: Sequence[list["ChatCompletionMessageParam"]], + chat_params: ChatParams, + tok_params: TokenizeParams, + *, + prompt_extras: dict[str, Any] | None = None, + ): + rendered = [ + self.render_messages(conversation, chat_params) + for conversation in conversations + ] + + out_conversations = list[list["ConversationMessage"]]() + dict_prompts = list[DictPrompt]() + for conv, prompt in rendered: + out_conversations.append(conv) + dict_prompts.append(prompt) + + tok_prompts = self.tokenize_prompts(dict_prompts, tok_params) + + self._apply_prompt_extras(tok_prompts, prompt_extras) + + # TODO: Apply multi-modal processor + return out_conversations, tok_prompts + + async def render_chat_async( + self, + conversations: Sequence[list["ChatCompletionMessageParam"]], + chat_params: ChatParams, + tok_params: TokenizeParams, + *, + prompt_extras: dict[str, Any] | None = None, + ): + rendered = [ + self.render_messages_async(conversation, chat_params) + for conversation in conversations + ] + + out_conversations = list[list["ConversationMessage"]]() + dict_prompts = list[DictPrompt]() + for conv, prompt in await asyncio.gather(*rendered): + out_conversations.append(conv) + dict_prompts.append(prompt) + + tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params) + + self._apply_prompt_extras(tok_prompts, prompt_extras) + + # TODO: Apply multi-modal processor + return out_conversations, tok_prompts