diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 3c14cf8a6..9a0d6d215 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -39,7 +39,11 @@ from vllm.multimodal.processing import ( ) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.step3_vl import Step3VisionEncoderConfig -from vllm.transformers_utils.processors.step3_vl import Step3VLProcessor +from vllm.transformers_utils.processors.step3_vl import ( + MAX_IMAGE_SIZE, + Step3VLImageProcessor, + Step3VLProcessor, +) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -86,21 +90,30 @@ Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingI class Step3VLProcessingInfo(BaseProcessingInfo): + def get_image_processor(self, **kwargs): + config = self.get_hf_config() + + kwargs.setdefault( + "enable_patch", + getattr(config.vision_config, "enable_patch", True), + ) + + return Step3VLImageProcessor(**kwargs) + def get_hf_processor(self) -> Step3VLProcessor: return Step3VLProcessor( - self.get_hf_config(), - self.get_tokenizer(), + tokenizer=self.get_tokenizer(), + image_processor=self.get_image_processor(), ) def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} def get_max_image_tokens(self) -> int: - hf_processor = self.get_hf_processor() - return hf_processor.get_num_image_tokens( - self.get_image_size_with_most_features().width, - self.get_image_size_with_most_features().height, - ) + image_processor = self.get_image_processor() + target_width, target_height = self.get_image_size_with_most_features() + + return image_processor.get_num_image_tokens(target_width, target_height) def get_mm_max_tokens_per_item( self, @@ -110,20 +123,7 @@ class Step3VLProcessingInfo(BaseProcessingInfo): return {"image": self.get_max_image_tokens()} def get_image_size_with_most_features(self) -> ImageSize: - return ImageSize(3024, 3024) - - def get_num_mm_tokens(self, mm_data: MultiModalDataDict) -> int: - if len(mm_data) != 1 or "image" not in mm_data: - raise ValueError("mm_data could only contain one key 'image' for steo1o") - - image_data = mm_data["image"] - if not isinstance(image_data, (list, tuple)): - image_data = [image_data] - - return sum( - self.get_hf_processor().get_num_image_tokens(img.width, img.height) - for img in image_data - ) + return ImageSize(MAX_IMAGE_SIZE, MAX_IMAGE_SIZE) class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]): @@ -165,13 +165,11 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]) def get_replacement_step1o(item_idx: int): out_item = out_mm_kwargs["image"][item_idx] num_patches = int(out_item["num_patches"].data) - if num_patches > 0: - patch_newline_mask = out_item["patch_newline_mask"].data - image_repl_ids = hf_processor._get_image_repl_features( - 1, num_patches, patch_newline_mask.tolist() - )[1] - else: - image_repl_ids = hf_processor._get_image_repl_features(1, 0, None)[1] + patch_newline_mask = out_item["patch_newline_mask"].data + image_repl_ids = hf_processor.get_image_repl_feature_ids( + 1, num_patches, patch_newline_mask.tolist() + ) + return PromptUpdateDetails.select_token_id( seq=image_repl_ids, embed_token_id=image_placeholder_token_id, diff --git a/vllm/transformers_utils/processors/internvl.py b/vllm/transformers_utils/processors/internvl.py index 41fed29af..fc582deef 100644 --- a/vllm/transformers_utils/processors/internvl.py +++ b/vllm/transformers_utils/processors/internvl.py @@ -558,6 +558,7 @@ class InternVLProcessor(ProcessorMixin): else: text_inputs = {} - combined_outputs = {**text_inputs, **image_inputs, **video_inputs} - - return BatchFeature(combined_outputs, tensor_type=return_tensors) + return BatchFeature( + data={**text_inputs, **image_inputs, **video_inputs}, + tensor_type=return_tensors, + ) diff --git a/vllm/transformers_utils/processors/kimi_k25.py b/vllm/transformers_utils/processors/kimi_k25.py index 6af16240d..06147f211 100644 --- a/vllm/transformers_utils/processors/kimi_k25.py +++ b/vllm/transformers_utils/processors/kimi_k25.py @@ -19,7 +19,6 @@ class KimiK25Processor(ProcessorMixin): self.media_token_id = media_token_id assert self.media_token_id is not None - # We do not support str input for text here def __call__( self, vision_chunks: list[VisionChunk] | None = None, diff --git a/vllm/transformers_utils/processors/step3_vl.py b/vllm/transformers_utils/processors/step3_vl.py index 358aedb41..66cf10e39 100644 --- a/vllm/transformers_utils/processors/step3_vl.py +++ b/vllm/transformers_utils/processors/step3_vl.py @@ -8,13 +8,13 @@ import torch from PIL import Image from torchvision import transforms from torchvision.transforms.functional import InterpolationMode -from transformers import BatchFeature, PretrainedConfig, TensorType +from transformers import BatchFeature, ProcessorMixin, TensorType from vllm.tokenizers import TokenizerLike MAX_IMAGE_SIZE: int = 3024 -ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None] +ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool]] class Step3VisionProcessor: @@ -185,7 +185,7 @@ class ImagePatcher: def __call__( self, img: Image.Image - ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]: + ) -> tuple[Image.Image, list[Image.Image], list[bool]]: img_width, img_height = img.size new_img_width, new_img_height = self.get_image_size_for_padding( img_width, img_height @@ -203,7 +203,7 @@ class ImagePatcher: ) if window_size == 0 or not self.enable_patch: - return img, [], None + return img, [], [] else: new_img_width, new_img_height = self.get_image_size_for_crop( new_img_width, new_img_height, window_size @@ -236,43 +236,28 @@ class ImagePatcher: return ( img, patches, - [i in newlines for i in range(len(patches))] - if len(patches) > 0 - else None, + [i in newlines for i in range(len(patches))], ) -class Step3VLProcessor: +class Step3VLImageProcessor: def __init__( self, - config: PretrainedConfig, - tokenizer: TokenizerLike, + image_size: int = 728, + patch_size: int = 504, + num_image_feature_size: int = 169, + num_patch_feature_size: int = 81, + enable_patch: bool = True, ) -> None: - super().__init__() - - self.config = config - self.tokenizer = tokenizer - self.image_size = 728 - self.patch_size = 504 + self.image_size = image_size + self.patch_size = patch_size + self.num_image_feature_size = num_image_feature_size + self.num_patch_feature_size = num_patch_feature_size self.image_preprocessor = Step3VisionProcessor( - self.image_size, "bilinear", self.patch_size + image_size, "bilinear", patch_size ) - - self.num_image_feature_size = 169 - self.num_patch_feature_size = 81 - self.image_token = "" - self.image_feature_placeholder = self.image_token * self.num_image_feature_size - self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size - - # Respect vision config switch to enable/disable patch extraction. - # For video understanding, it's preferable to disable patch. - enable_patch = getattr(self.config.vision_config, "enable_patch", True) self.patcher = ImagePatcher(enable_patch=enable_patch) - @property - def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[self.image_token] - def get_num_image_tokens(self, img_width: int, img_height: int) -> int: num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height) @@ -299,58 +284,168 @@ class Step3VLProcessor: for img in images ] - def _get_patch_repl( + def __call__( self, - num_patches: int, - patch_newline_mask: list[bool] | None, - ) -> tuple[str, list[int]]: - text = "" - token_ids = [] - for i in range(num_patches): - assert ( - patch_newline_mask is not None - and len(patch_newline_mask) == num_patches - ) - text += f"{self.patch_feature_placeholder}" - token_ids.extend( - [self.tokenizer.convert_tokens_to_ids("")] - + [self.image_token_id] * self.num_patch_feature_size - + [self.tokenizer.convert_tokens_to_ids("")] - ) - if patch_newline_mask and patch_newline_mask[i]: - text += "" - token_ids.append( - self.tokenizer.convert_tokens_to_ids("") - ) - return text, token_ids + images: Image.Image | list[Image.Image] | None = None, + return_tensors: str | TensorType | None = None, + ) -> BatchFeature: + if images is None: + images = [] + if not isinstance(images, list): + images = [images] - def _get_image_repl( + split_images_data = self._split_images(images) + pixel_values_lst = [] + patch_pixel_values_lst = [] + patch_newline_mask_lst = [] + num_patches = [] + for raw_img, img_patches, patch_newline_mask in split_images_data: + pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img])) + num_patches.append(len(img_patches)) + patch_pixel_values_lst.extend( + self._convert_images_to_pixel_values(img_patches, is_patch=True) + ) + patch_newline_mask_lst.extend(patch_newline_mask) + + pixel_values = torch.cat(pixel_values_lst) + patch_size = self.patch_size + image_inputs = { + "pixel_values": pixel_values, + "num_patches": num_patches, + "patch_pixel_values": ( + torch.cat(patch_pixel_values_lst) + if patch_pixel_values_lst + else pixel_values.new_empty((0, 3, patch_size, patch_size)) + ), + "patch_newline_mask": torch.tensor( + patch_newline_mask_lst, dtype=torch.bool + ), + } + return BatchFeature(image_inputs, tensor_type=return_tensors) + + +class Step3VLProcessor(ProcessorMixin): + attributes = ["image_processor", "tokenizer"] + + def __init__( self, - num_images: int, - ) -> tuple[str, list[int]]: - text = f"{self.image_feature_placeholder}" - token_ids = ( - [self.tokenizer.convert_tokens_to_ids("")] - + [self.image_token_id] * self.num_image_feature_size - + [self.tokenizer.convert_tokens_to_ids("")] + image_processor: Step3VLImageProcessor, + tokenizer: TokenizerLike, + ) -> None: + self.image_processor = image_processor + self.tokenizer = tokenizer + + self.image_start_token = image_start_token = "" + self.image_end_token = image_end_token = "" + self.patch_start_token = patch_start_token = "" + self.patch_end_token = patch_end_token = "" + self.patch_newline_token = patch_newline_token = "" + self.image_start_token_id = tokenizer.convert_tokens_to_ids(image_start_token) + self.image_end_token_id = tokenizer.convert_tokens_to_ids(image_end_token) + self.patch_start_token_id = tokenizer.convert_tokens_to_ids(patch_start_token) + self.patch_end_token_id = tokenizer.convert_tokens_to_ids(patch_end_token) + self.patch_newline_token_id = tokenizer.convert_tokens_to_ids( + patch_newline_token ) - return text * num_images, token_ids * num_images - def _get_image_repl_features( + self.image_token = image_token = "" + self.image_feature_tokens = image_token * image_processor.num_image_feature_size + self.patch_feature_tokens = image_token * image_processor.num_patch_feature_size + + self.image_token_id = image_token_id = tokenizer.convert_tokens_to_ids( + image_token + ) + self.image_feature_token_ids = [ + image_token_id + ] * image_processor.num_image_feature_size + self.patch_feature_token_ids = [ + image_token_id + ] * image_processor.num_patch_feature_size + + def _get_patch_repl_text( + self, + num_patches: int, + patch_newline_mask: list[bool], + ) -> str: + assert len(patch_newline_mask) == num_patches + + parts = [] + for i in range(num_patches): + parts.extend( + [ + self.patch_start_token, + self.patch_feature_tokens, + self.patch_end_token, + ] + ) + if patch_newline_mask[i]: + parts.append(self.patch_newline_token) + + return "".join(parts) + + def _get_patch_repl_ids( + self, + num_patches: int, + patch_newline_mask: list[bool], + ) -> list[int]: + assert len(patch_newline_mask) == num_patches + + parts = [] + for i in range(num_patches): + parts.extend( + [ + self.patch_start_token_id, + *self.patch_feature_token_ids, + self.patch_end_token_id, + ] + ) + if patch_newline_mask[i]: + parts.append(self.patch_newline_token_id) + + return parts + + def _get_image_repl_text( + self, + num_images: int, + ) -> str: + parts = [ + self.image_start_token, + self.image_feature_tokens, + self.image_end_token, + ] * num_images + + return "".join(parts) + + def _get_image_repl_ids( + self, + num_images: int, + ) -> list[int]: + part = [ + self.image_start_token_id, + *self.image_feature_token_ids, + self.image_end_token_id, + ] + return part * num_images + + def get_image_repl_feature_text( self, num_images: int, num_patches: int, - patch_new_line_idx: list[bool] | None, - ) -> tuple[str, list[int]]: - if num_patches > 0: - patch_repl, patch_repl_ids = self._get_patch_repl( - num_patches, patch_new_line_idx - ) - else: - patch_repl = "" - patch_repl_ids = [] - image_repl, image_repl_ids = self._get_image_repl(num_images) - return patch_repl + image_repl, patch_repl_ids + image_repl_ids + patch_new_line_idx: list[bool], + ) -> str: + patch_repl = self._get_patch_repl_text(num_patches, patch_new_line_idx) + image_repl = self._get_image_repl_text(num_images) + return patch_repl + image_repl + + def get_image_repl_feature_ids( + self, + num_images: int, + num_patches: int, + patch_new_line_idx: list[bool], + ) -> list[int]: + patch_repl = self._get_patch_repl_ids(num_patches, patch_new_line_idx) + image_repl = self._get_image_repl_ids(num_images) + return patch_repl + image_repl def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str: parts = text.split(placeholder) @@ -373,69 +468,44 @@ class Step3VLProcessor: images: Image.Image | list[Image.Image] | None = None, return_tensors: str | TensorType | None = None, ) -> BatchFeature: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - - if len(images) == 0: + if images is not None: + image_inputs = self.image_processor( + images=images, + return_tensors=return_tensors, + ) + num_patches = image_inputs["num_patches"] + patch_newline_mask = image_inputs["patch_newline_mask"] + else: image_inputs = {} + num_patches = [] + patch_newline_mask = [] + + if text is not None: + if not isinstance(text, list): + text = [text] + + if image_inputs: + image_token = self.image_token + image_repl_str_lst = [] + start = 0 + for n_patches in num_patches: + image_repl_str = self.get_image_repl_feature_text( + 1, n_patches, patch_newline_mask[start : start + n_patches] + ) + image_repl_str_lst.append(image_repl_str) + + start += n_patches + + text = [ + self.replace_placeholder(t, image_token, image_repl_str_lst) + for t in text + ] + text_inputs = self.tokenizer(text) else: - split_images_data = self._split_images(images) - pixel_values_lst = [] - patch_pixel_values_lst = [] - patch_newline_mask_lst = [] - image_repl_str_lst = [] - image_repl_ids_lst = [] - num_patches = [] - for raw_img, img_patches, patch_newline_mask in split_images_data: - pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img])) - - if len(img_patches) > 0: - patch_pixel_values_lst.extend( - self._convert_images_to_pixel_values(img_patches, is_patch=True) - ) - num_patches.append(len(img_patches)) - - image_repl_str, image_repl_ids = self._get_image_repl_features( - 1, len(img_patches), patch_newline_mask - ) - image_repl_str_lst.append(image_repl_str) - image_repl_ids_lst.extend(image_repl_ids) - - if patch_newline_mask is not None: - patch_newline_mask_lst.extend(patch_newline_mask) - - pixel_values = torch.cat(pixel_values_lst) - patch_size = self.patch_size - image_inputs = { - "pixel_values": pixel_values, - "num_patches": num_patches, - "patch_pixel_values": ( - torch.cat(patch_pixel_values_lst) - if patch_pixel_values_lst - else pixel_values.new_empty((0, 3, patch_size, patch_size)) - ), - "patch_newline_mask": torch.tensor( - patch_newline_mask_lst, dtype=torch.bool - ), - } - - text = [ - self.replace_placeholder(t, self.image_token, image_repl_str_lst) - for t in text - ] - text_inputs = self.tokenizer(text) + text_inputs = {} return BatchFeature( - { - **text_inputs, - **image_inputs, - }, + data={**text_inputs, **image_inputs}, tensor_type=return_tensors, )