diff --git a/tests/conftest.py b/tests/conftest.py
index 0d456fb36..9f811d5d8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -405,6 +405,7 @@ class HfRunner:
         images: PromptImageInput | None = None,
         videos: PromptVideoInput | None = None,
         audios: PromptAudioInput | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -418,10 +419,18 @@ class HfRunner:
         all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
             if isinstance(prompt, str):
-                processor_kwargs: dict[str, Any] = {
-                    "text": prompt,
-                    "return_tensors": "pt",
-                }
+                # Create a copy to avoid modifying the original dict
+                processor_kwargs = (
+                    tokenization_kwargs.copy()
+                    if tokenization_kwargs is not None
+                    else {}
+                )
+                processor_kwargs.update(
+                    {
+                        "text": prompt,
+                        "return_tensors": "pt",
+                    }
+                )
                 if images is not None and (image := images[i]) is not None:
                     processor_kwargs["images"] = image
                 if videos is not None and (video := videos[i]) is not None:
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 92ae115a1..72886cbf7 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 import pytest
 from transformers import SiglipModel
 
@@ -35,7 +37,11 @@ def _run_test(
     model: str,
     *,
     dtype: str,
+    tokenization_kwargs: dict[str, Any] | None = None,
 ) -> None:
+    if tokenization_kwargs is None:
+        tokenization_kwargs = {}
+
     with vllm_runner(
@@ -44,10 +50,14 @@ def _run_test(
         max_model_len=64,
         gpu_memory_utilization=0.7,
     ) as vllm_model:
-        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+        vllm_outputs = vllm_model.embed(
+            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
+        )
 
     with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
-        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+        all_inputs = hf_model.get_inputs(
+            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
+        )
 
         all_outputs = []
         for inputs in all_inputs:
@@ -94,6 +104,10 @@ def test_models_text(
         input_images,  # type: ignore
         model,
         dtype=dtype,
+        tokenization_kwargs={
+            "padding": "max_length",
+            "max_length": 64,
+        },  # siglip2 was trained with this padding setting.
     )
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index add917634..913324fd5 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1076,6 +1076,7 @@ class LLM:
             params=pooling_params,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -1113,6 +1114,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[EmbeddingRequestOutput]:
         """
         Generate an embedding vector for each prompt.
@@ -1150,6 +1152,7 @@ class LLM:
             pooling_params=pooling_params,
             lora_request=lora_request,
             pooling_task="embed",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         return [EmbeddingRequestOutput.from_base(item) for item in items]
@@ -1161,6 +1164,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ClassificationRequestOutput]:
         """
         Generate class logits for each prompt.
@@ -1196,6 +1200,7 @@ class LLM:
             pooling_params=pooling_params,
             lora_request=lora_request,
             pooling_task="classify",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         return [ClassificationRequestOutput.from_base(item) for item in items]
@@ -1209,6 +1214,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[PoolingRequestOutput]:
         """
         Generate rewards for each prompt.
@@ -1236,6 +1242,7 @@ class LLM:
             pooling_params=pooling_params,
             truncate_prompt_tokens=truncate_prompt_tokens,
             pooling_task="token_classify",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
     def _embedding_score(
@@ -1247,6 +1254,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
         encoded_output: list[PoolingRequestOutput] = self.encode(
             text_1 + text_2,
@@ -1255,6 +1263,7 @@ class LLM:
             lora_request=lora_request,
             pooling_params=pooling_params,
             pooling_task="embed",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)]
@@ -1279,6 +1288,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
 
         model_config = self.model_config
@@ -1294,7 +1304,8 @@ class LLM:
         pooling_params.verify("score", model_config)
         pooling_params_list = list[PoolingParams]()
 
-        tokenization_kwargs: dict[str, Any] = {}
+        local_kwargs = tokenization_kwargs or {}
+        tokenization_kwargs = local_kwargs.copy()
         _validate_truncation_size(
             model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs
@@ -1557,6 +1568,7 @@ class LLM:
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: Sequence[LoRARequest] | LoRARequest | None,
         priority: list[int] | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> None:
         if isinstance(prompts, (str, dict)):
             # Convert a single prompt to a list.
@@ -1602,6 +1614,7 @@ class LLM:
                     if isinstance(lora_request, Sequence)
                     else lora_request,
                     priority=priority[i] if priority else 0,
+                    tokenization_kwargs=tokenization_kwargs,
                 )
                 added_request_ids.append(request_id)
         except Exception as e:
@@ -1665,9 +1678,12 @@ class LLM:
         *,
         lora_request: LoRARequest | None,
         priority: int,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for LLMEngine."""
-        tokenization_kwargs: dict[str, Any] = {}
+
+        local_kwargs = tokenization_kwargs or {}
+        tokenization_kwargs = local_kwargs.copy()
         _validate_truncation_size(
             self.model_config.max_model_len,
             params.truncate_prompt_tokens,
@@ -1690,6 +1706,7 @@ class LLM:
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
         priority: int = 0,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> str:
         prompt_text, _, _ = get_prompt_components(prompt)
         request_id = str(next(self.request_counter))
@@ -1700,6 +1717,7 @@ class LLM:
             params,
             lora_request=lora_request,
             priority=priority,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         self.llm_engine.add_request(
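# --- Usage sketch (illustrative, not part of the diff) ----------------------
# The hunks above thread `tokenization_kwargs` from the public pooling entry
# points (`embed`, `classify`, `reward`, `score`) through
# `_validate_and_add_requests` and `_add_request` into `_process_inputs`.
# A minimal sketch of what that enables, assuming a pooling-capable SigLIP2
# checkpoint; the model id below is a placeholder, and the padding settings
# mirror those used in test_siglip.py.
from vllm import LLM

llm = LLM(model="<siglip2-checkpoint>", runner="pooling")
outputs = llm.embed(
    ["a photo of a cat"],
    # Forwarded to the tokenizer, so prompts are padded to the fixed length
    # SigLIP2 was trained with.
    tokenization_kwargs={"padding": "max_length", "max_length": 64},
)
print(len(outputs[0].outputs.embedding))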