[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 13:24:23 +08:00
parent 493c275352
commit f17aec0d63
50 changed files with 3455 additions and 3180 deletions
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@@ -25,59 +25,63 @@ Further update the model as follows:

 - Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.

-    ```python
-    class YourModelForImage2Seq(nn.Module):
-        ...
+    ??? Code

-        def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
+        ```python
+        class YourModelForImage2Seq(nn.Module):
+            ...

-            assert self.vision_encoder is not None
-            image_features = self.vision_encoder(image_input)
-            return self.multi_modal_projector(image_features)
+            def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:

-        def get_multimodal_embeddings(
-                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+                assert self.vision_encoder is not None
+                image_features = self.vision_encoder(image_input)
+                return self.multi_modal_projector(image_features)

-            # Validate the multimodal input keyword arguments
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is None:
-                return None
+            def get_multimodal_embeddings(
+                    self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

-            # Run multimodal inputs through encoder and projector
-            vision_embeddings = self._process_image_input(image_input)
-            return vision_embeddings
-    ```
+                # Validate the multimodal input keyword arguments
+                image_input = self._parse_and_validate_image_input(**kwargs)
+                if image_input is None:
+                    return None
+
+                # Run multimodal inputs through encoder and projector
+                vision_embeddings = self._process_image_input(image_input)
+                return vision_embeddings
+        ```

 !!! important
    The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.

 - Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.

-    ```python
-    from .utils import merge_multimodal_embeddings
+    ??? Code

-    class YourModelForImage2Seq(nn.Module):
-        ...
+        ```python
+        from .utils import merge_multimodal_embeddings

-        def get_input_embeddings(
-            self,
-            input_ids: torch.Tensor,
-            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-        ) -> torch.Tensor:
+        class YourModelForImage2Seq(nn.Module):
+            ...

-            # `get_input_embeddings` should already be implemented for the language 
-            # model as one of the requirements of basic vLLM model implementation.
-            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+            def get_input_embeddings(
+                self,
+                input_ids: torch.Tensor,
+                multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+            ) -> torch.Tensor:

-            if multimodal_embeddings is not None:
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids=input_ids, 
-                    inputs_embeds=inputs_embeds, 
-                    multimodal_embeddings=multimodal_embeddings,
-                    placeholder_token_id=self.config.image_token_index)
+                # `get_input_embeddings` should already be implemented for the language 
+                # model as one of the requirements of basic vLLM model implementation.
+                inputs_embeds = self.language_model.get_input_embeddings(input_ids)

-            return inputs_embeds
-    ```
+                if multimodal_embeddings is not None:
+                    inputs_embeds = merge_multimodal_embeddings(
+                        input_ids=input_ids, 
+                        inputs_embeds=inputs_embeds, 
+                        multimodal_embeddings=multimodal_embeddings,
+                        placeholder_token_id=self.config.image_token_index)
+
+                return inputs_embeds
+        ```

 - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.

@@ -135,42 +139,46 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

    Looking at the code of HF's `LlavaForConditionalGeneration`:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
-    n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
-    n_image_features = image_features.shape[0] * image_features.shape[1]
+    ??? Code

-    if n_image_tokens != n_image_features:
-        raise ValueError(
-            f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
+        n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        special_image_mask = (
+            (input_ids == self.config.image_token_index)
+            .unsqueeze(-1)
+            .expand_as(inputs_embeds)
+            .to(inputs_embeds.device)
        )
-    special_image_mask = (
-        (input_ids == self.config.image_token_index)
-        .unsqueeze(-1)
-        .expand_as(inputs_embeds)
-        .to(inputs_embeds.device)
-    )
-    image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-    inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-    ```
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        ```

    The number of placeholder feature tokens per image is `image_features.shape[1]`.
    `image_features` is calculated inside the `get_image_features` method:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
-    image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+    ??? Code

-    selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
-    if vision_feature_select_strategy == "default":
-        selected_image_feature = selected_image_feature[:, 1:]
-    elif vision_feature_select_strategy == "full":
-        selected_image_feature = selected_image_feature
-    else:
-        raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-    image_features = self.multi_modal_projector(selected_image_feature)
-    return image_features
-    ```
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+
+        selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+        if vision_feature_select_strategy == "default":
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+        image_features = self.multi_modal_projector(selected_image_feature)
+        return image_features
+        ```

    We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
    (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
@@ -193,20 +201,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

    To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
-    target_dtype = self.patch_embedding.weight.dtype
-    patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
-    patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+    ??? Code

-    class_embeds = self.class_embedding.expand(batch_size, 1, -1)
-    embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
-    if interpolate_pos_encoding:
-        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
-    else:
-        embeddings = embeddings + self.position_embedding(self.position_ids)
-    return embeddings
-    ```
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+        ```

    We can infer that `embeddings.shape[1] == self.num_positions`, where

@@ -218,55 +228,59 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

    Overall, the number of placeholder feature tokens for an image can be calculated as:

-    ```python
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        hf_config = self.get_hf_config()
-        hf_processor = self.get_hf_processor()
+    ??? Code

-        image_size = hf_config.vision_config.image_size
-        patch_size = hf_config.vision_config.patch_size
+        ```python
+        def get_num_image_tokens(
+            self,
+            *,
+            image_width: int,
+            image_height: int,
+        ) -> int:
+            hf_config = self.get_hf_config()
+            hf_processor = self.get_hf_processor()

-        num_image_tokens = (image_size // patch_size) ** 2 + 1
-        if hf_processor.vision_feature_select_strategy == "default":
-            num_image_tokens -= 1
+            image_size = hf_config.vision_config.image_size
+            patch_size = hf_config.vision_config.patch_size

-        return num_image_tokens
-    ```
+            num_image_tokens = (image_size // patch_size) ** 2 + 1
+            if hf_processor.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            return num_image_tokens
+        ```

    Notice that the number of image tokens doesn't depend on the image width and height.
    We can simply use a dummy `image_size` to calculate the multimodal profiling data:

-    ```python
-    # NOTE: In actuality, this is usually implemented as part of the
-    # model's subclass of `BaseProcessingInfo`, but we show it as is
-    # here for simplicity.
-    def get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self.get_hf_config()
-        width = height = hf_config.image_size
-        return ImageSize(width=width, height=height)
+    ??? Code

-    def get_dummy_mm_data(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> MultiModalDataDict:
-        num_images = mm_counts.get("image", 0)
+        ```python
+        # NOTE: In actuality, this is usually implemented as part of the
+        # model's subclass of `BaseProcessingInfo`, but we show it as is
+        # here for simplicity.
+        def get_image_size_with_most_features(self) -> ImageSize:
+            hf_config = self.get_hf_config()
+            width = height = hf_config.image_size
+            return ImageSize(width=width, height=height)

-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
+        def get_dummy_mm_data(
+            self,
+            seq_len: int,
+            mm_counts: Mapping[str, int],
+        ) -> MultiModalDataDict:
+            num_images = mm_counts.get("image", 0)

-        return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
-        }
-    ```
+            target_width, target_height = \
+                self.info.get_image_size_with_most_features()
+
+            return {
+                "image":
+                self._get_dummy_images(width=target_width,
+                                       height=target_height,
+                                       num_images=num_images)
+            }
+        ```

    For the text, we simply expand the multimodal image token from the model config to match the desired number of images.

@@ -284,21 +298,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

    Looking at the code of HF's `FuyuForCausalLM`:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
-    if image_patches is not None and past_key_values is None:
-        patch_embeddings = [
-            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
-            .squeeze(0)
-            .to(inputs_embeds.device)
-            for patch in image_patches
-        ]
-        inputs_embeds = self.gather_continuous_embeddings(
-            word_embeddings=inputs_embeds,
-            continuous_embeddings=patch_embeddings,
-            image_patch_input_indices=image_patches_indices,
-        )
-    ```
+    ??? Code
+
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
+        if image_patches is not None and past_key_values is None:
+            patch_embeddings = [
+                self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
+                .squeeze(0)
+                .to(inputs_embeds.device)
+                for patch in image_patches
+            ]
+            inputs_embeds = self.gather_continuous_embeddings(
+                word_embeddings=inputs_embeds,
+                continuous_embeddings=patch_embeddings,
+                image_patch_input_indices=image_patches_indices,
+            )
+        ```

    The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
    which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
@@ -312,92 +328,98 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
    In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
    returning the dimensions after resizing (but before padding) as metadata.

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
-    image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
-    batch_images = image_encoding["images"]
-    image_unpadded_heights = image_encoding["image_unpadded_heights"]
-    image_unpadded_widths = image_encoding["image_unpadded_widths"]
+    ??? Code

-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
-    if do_resize:
-        batch_images = [
-            [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
-            for images in batch_images
-        ]
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
+        image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
+        batch_images = image_encoding["images"]
+        image_unpadded_heights = image_encoding["image_unpadded_heights"]
+        image_unpadded_widths = image_encoding["image_unpadded_widths"]

-    image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
-    image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
-    image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
-
-    if do_pad:
-        batch_images = [
-            [
-                self.pad_image(
-                    image,
-                    size=size,
-                    mode=padding_mode,
-                    constant_values=padding_value,
-                    input_data_format=input_data_format,
-                )
-                for image in images
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
+        if do_resize:
+            batch_images = [
+                [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
+                for images in batch_images
            ]
-            for images in batch_images
-        ]
-    ```
+
+        image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
+        image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
+        image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
+
+        if do_pad:
+            batch_images = [
+                [
+                    self.pad_image(
+                        image,
+                        size=size,
+                        mode=padding_mode,
+                        constant_values=padding_value,
+                        input_data_format=input_data_format,
+                    )
+                    for image in images
+                ]
+                for images in batch_images
+            ]
+        ```

    In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
-    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
-        image_input=tensor_batch_images,
-        image_present=image_present,
-        image_unpadded_h=image_unpadded_heights,
-        image_unpadded_w=image_unpadded_widths,
-        image_placeholder_id=image_placeholder_id,
-        image_newline_id=image_newline_id,
-        variable_sized=True,
-    )
+    ??? Code

-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
-    image_height, image_width = image.shape[1], image.shape[2]
-    if variable_sized:  # variable_sized=True
-        new_h = min(
-            image_height,
-            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
+        model_image_input = self.image_processor.preprocess_with_tokenizer_info(
+            image_input=tensor_batch_images,
+            image_present=image_present,
+            image_unpadded_h=image_unpadded_heights,
+            image_unpadded_w=image_unpadded_widths,
+            image_placeholder_id=image_placeholder_id,
+            image_newline_id=image_newline_id,
+            variable_sized=True,
        )
-        new_w = min(
-            image_width,
-            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
-        )
-        image = image[:, :new_h, :new_w]
-        image_height, image_width = new_h, new_w

-    num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
-    tensor_of_image_ids = torch.full(
-        [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
-    )
-    patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
-    assert num_patches == patches.shape[0]
-    ```
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
+        image_height, image_width = image.shape[1], image.shape[2]
+        if variable_sized:  # variable_sized=True
+            new_h = min(
+                image_height,
+                math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
+            )
+            new_w = min(
+                image_width,
+                math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
+            )
+            image = image[:, :new_h, :new_w]
+            image_height, image_width = new_h, new_w
+
+        num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
+        tensor_of_image_ids = torch.full(
+            [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
+        )
+        patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
+        assert num_patches == patches.shape[0]
+        ```

    The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
-    patch_size = patch_size if patch_size is not None else self.patch_size
-    patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
+    ??? Code

-    if image_height % patch_height != 0:
-        raise ValueError(f"{image_height=} must be divisible by {patch_height}")
-    if image_width % patch_width != 0:
-        raise ValueError(f"{image_width=} must be divisible by {patch_width}")
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]

-    num_patches_per_dim_h = image_height // patch_height
-    num_patches_per_dim_w = image_width // patch_width
-    num_patches = num_patches_per_dim_h * num_patches_per_dim_w
-    ```
+        if image_height % patch_height != 0:
+            raise ValueError(f"{image_height=} must be divisible by {patch_height}")
+        if image_width % patch_width != 0:
+            raise ValueError(f"{image_width=} must be divisible by {patch_width}")
+
+        num_patches_per_dim_h = image_height // patch_height
+        num_patches_per_dim_w = image_width // patch_width
+        num_patches = num_patches_per_dim_h * num_patches_per_dim_w
+        ```

    These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
    to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
@@ -419,23 +441,25 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

    For the multimodal image profiling data, the logic is very similar to LLaVA:

-    ```python
-    def get_dummy_mm_data(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> MultiModalDataDict:
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-        num_images = mm_counts.get("image", 0)
+    ??? Code

-        return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
-        }
-    ```
+        ```python
+        def get_dummy_mm_data(
+            self,
+            seq_len: int,
+            mm_counts: Mapping[str, int],
+        ) -> MultiModalDataDict:
+            target_width, target_height = \
+                self.info.get_image_size_with_most_features()
+            num_images = mm_counts.get("image", 0)
+
+            return {
+                "image":
+                self._get_dummy_images(width=target_width,
+                                       height=target_height,
+                                       num_images=num_images)
+            }
+        ```

 ## 4. Specify processing details

@@ -455,6 +479,7 @@ return a schema of the tensors outputted by the HF processor that are related to
    The output of `CLIPImageProcessor` is a simple tensor with shape
    `(num_images, num_channels, image_height, image_width)`:

+
    ```python
    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
    images = [
@@ -505,35 +530,37 @@ return a schema of the tensors outputted by the HF processor that are related to
    In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
    we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:

-    ```python
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        processed_outputs = super()._call_hf_processor(
-            prompt=prompt,
-            mm_data=mm_data,
-            mm_kwargs=mm_kwargs,
-        )
+    ??? Code

-        image_patches = processed_outputs.get("image_patches")
-        if image_patches is not None:
-            images = mm_data["images"]
-            assert isinstance(images, list)
+        ```python
+        def _call_hf_processor(
+            self,
+            prompt: str,
+            mm_data: Mapping[str, object],
+            mm_kwargs: Mapping[str, object],
+        ) -> BatchFeature:
+            processed_outputs = super()._call_hf_processor(
+                prompt=prompt,
+                mm_data=mm_data,
+                mm_kwargs=mm_kwargs,
+            )

-            # Original output: (1, num_images, Pn, Px * Py * C)
-            # New output: (num_images, Pn, Px * Py * C)
-            assert (isinstance(image_patches, list)
-                    and len(image_patches) == 1)
-            assert (isinstance(image_patches[0], torch.Tensor)
-                    and len(image_patches[0]) == len(images))
+            image_patches = processed_outputs.get("image_patches")
+            if image_patches is not None:
+                images = mm_data["images"]
+                assert isinstance(images, list)

-            processed_outputs["image_patches"] = image_patches[0]
+                # Original output: (1, num_images, Pn, Px * Py * C)
+                # New output: (num_images, Pn, Px * Py * C)
+                assert (isinstance(image_patches, list)
+                        and len(image_patches) == 1)
+                assert (isinstance(image_patches[0], torch.Tensor)
+                        and len(image_patches[0]) == len(images))

-        return processed_outputs
-    ```
+                processed_outputs["image_patches"] = image_patches[0]
+
+            return processed_outputs
+        ```

    !!! note
        Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
@@ -573,35 +600,37 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
    It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
    Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:

-    ```python
-    def _get_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
-    ) -> Sequence[PromptUpdate]:
-        hf_config = self.info.get_hf_config()
-        image_token_id = hf_config.image_token_index
+    ??? Code

-        def get_replacement(item_idx: int):
-            images = mm_items.get_items("image", ImageProcessorItems)
+        ```python
+        def _get_prompt_updates(
+            self,
+            mm_items: MultiModalDataItems,
+            hf_processor_mm_kwargs: Mapping[str, object],
+            out_mm_kwargs: MultiModalKwargs,
+        ) -> Sequence[PromptUpdate]:
+            hf_config = self.info.get_hf_config()
+            image_token_id = hf_config.image_token_index

-            image_size = images.get_image_size(item_idx)
-            num_image_tokens = self.info.get_num_image_tokens(
-                image_width=image_size.width,
-                image_height=image_size.height,
-            )
+            def get_replacement(item_idx: int):
+                images = mm_items.get_items("image", ImageProcessorItems)

-            return [image_token_id] * num_image_tokens
+                image_size = images.get_image_size(item_idx)
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )

-        return [
-            PromptReplacement(
-                modality="image",
-                target=[image_token_id],
-                replacement=get_replacement,
-            ),
-        ]
-    ```
+                return [image_token_id] * num_image_tokens
+
+            return [
+                PromptReplacement(
+                    modality="image",
+                    target=[image_token_id],
+                    replacement=get_replacement,
+                ),
+            ]
+        ```

 === "Handling additional tokens: Fuyu"

@@ -616,117 +645,90 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies

    We define a helper function to return `ncols` and `nrows` directly:

-    ```python
-    def get_image_feature_grid_size(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> tuple[int, int]:
-        image_processor = self.get_image_processor()
-        target_width = image_processor.size["width"]
-        target_height = image_processor.size["height"]
-        patch_width = image_processor.patch_size["width"]
-        patch_height = image_processor.patch_size["height"]
+    ??? Code

-        if not (image_width <= target_width and image_height <= target_height):
-            height_scale_factor = target_height / image_height
-            width_scale_factor = target_width / image_width
-            optimal_scale_factor = min(height_scale_factor, width_scale_factor)
+        ```python
+        def get_image_feature_grid_size(
+            self,
+            *,
+            image_width: int,
+            image_height: int,
+        ) -> tuple[int, int]:
+            image_processor = self.get_image_processor()
+            target_width = image_processor.size["width"]
+            target_height = image_processor.size["height"]
+            patch_width = image_processor.patch_size["width"]
+            patch_height = image_processor.patch_size["height"]

-            image_height = int(image_height * optimal_scale_factor)
-            image_width = int(image_width * optimal_scale_factor)
+            if not (image_width <= target_width and image_height <= target_height):
+                height_scale_factor = target_height / image_height
+                width_scale_factor = target_width / image_width
+                optimal_scale_factor = min(height_scale_factor, width_scale_factor)

-        ncols = math.ceil(image_width / patch_width)
-        nrows = math.ceil(image_height / patch_height)
-        return ncols, nrows
-    ```
+                image_height = int(image_height * optimal_scale_factor)
+                image_width = int(image_width * optimal_scale_factor)
+
+            ncols = math.ceil(image_width / patch_width)
+            nrows = math.ceil(image_height / patch_height)
+            return ncols, nrows
+        ```

    Based on this, we can initially define our replacement tokens as:

-    ```python
-    def get_replacement(item_idx: int):
-        images = mm_items.get_items("image", ImageProcessorItems)
-        image_size = images.get_image_size(item_idx)
+    ??? Code

-        ncols, nrows = self.info.get_image_feature_grid_size(
-            image_width=image_size.width,
-            image_height=image_size.height,
-        )
+        ```python
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)

-        # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
-        # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
-        return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
-    ```
+            ncols, nrows = self.info.get_image_feature_grid_size(
+                image_width=image_size.width,
+                image_height=image_size.height,
+            )
+
+            # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
+            # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
+            return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
+        ```

    However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
     a BOS token (`<s>`) is also added to the prompt:

-    ```python
-    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
-    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
-        image_input=tensor_batch_images,
-        image_present=image_present,
-        image_unpadded_h=image_unpadded_heights,
-        image_unpadded_w=image_unpadded_widths,
-        image_placeholder_id=image_placeholder_id,
-        image_newline_id=image_newline_id,
-        variable_sized=True,
-    )
-    prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
-        tokenizer=self.tokenizer,
-        prompts=prompts,
-        scale_factors=scale_factors,
-        max_tokens_to_generate=self.max_tokens_to_generate,
-        max_position_embeddings=self.max_position_embeddings,
-        add_BOS=True,
-        add_beginning_of_answer_token=True,
-    )
-    ```
+    ??? Code
+
+        ```python
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
+        model_image_input = self.image_processor.preprocess_with_tokenizer_info(
+            image_input=tensor_batch_images,
+            image_present=image_present,
+            image_unpadded_h=image_unpadded_heights,
+            image_unpadded_w=image_unpadded_widths,
+            image_placeholder_id=image_placeholder_id,
+            image_newline_id=image_newline_id,
+            variable_sized=True,
+        )
+        prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
+            tokenizer=self.tokenizer,
+            prompts=prompts,
+            scale_factors=scale_factors,
+            max_tokens_to_generate=self.max_tokens_to_generate,
+            max_position_embeddings=self.max_position_embeddings,
+            add_BOS=True,
+            add_beginning_of_answer_token=True,
+        )
+        ```

    To assign the vision embeddings to only the image tokens, instead of a string
    you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:

-    ```python
-    hf_config = self.info.get_hf_config()
-    bos_token_id = hf_config.bos_token_id  # `<s>`
-    assert isinstance(bos_token_id, int)
+    ??? Code

-    def get_replacement_fuyu(item_idx: int):
-        images = mm_items.get_items("image", ImageProcessorItems)
-        image_size = images.get_image_size(item_idx)
-
-        ncols, nrows = self.info.get_image_feature_grid_size(
-            image_width=image_size.width,
-            image_height=image_size.height,
-        )
-        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                        [_NEWLINE_TOKEN_ID]) * nrows
-
-        return PromptUpdateDetails.select_token_id(
-            image_tokens + [bos_token_id],
-            embed_token_id=_IMAGE_TOKEN_ID,
-        )
-    ```
-
-    Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
-    we can search for it to conduct the replacement at the start of the string:
-
-    ```python
-    def _get_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
-    ) -> Sequence[PromptUpdate]:
+        ```python
        hf_config = self.info.get_hf_config()
-        bos_token_id = hf_config.bos_token_id
+        bos_token_id = hf_config.bos_token_id  # `<s>`
        assert isinstance(bos_token_id, int)

-        tokenizer = self.info.get_tokenizer()
-        eot_token_id = tokenizer.bos_token_id
-        assert isinstance(eot_token_id, int)
-
        def get_replacement_fuyu(item_idx: int):
            images = mm_items.get_items("image", ImageProcessorItems)
            image_size = images.get_image_size(item_idx)
@@ -742,15 +744,52 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                image_tokens + [bos_token_id],
                embed_token_id=_IMAGE_TOKEN_ID,
            )
+        ```

-        return [
-            PromptReplacement(
-                modality="image",
-                target=[eot_token_id],
-                replacement=get_replacement_fuyu,
-            )
-        ]
-    ```
+    Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
+    we can search for it to conduct the replacement at the start of the string:
+
+    ??? Code
+
+        ```python
+        def _get_prompt_updates(
+            self,
+            mm_items: MultiModalDataItems,
+            hf_processor_mm_kwargs: Mapping[str, object],
+            out_mm_kwargs: MultiModalKwargs,
+        ) -> Sequence[PromptUpdate]:
+            hf_config = self.info.get_hf_config()
+            bos_token_id = hf_config.bos_token_id
+            assert isinstance(bos_token_id, int)
+
+            tokenizer = self.info.get_tokenizer()
+            eot_token_id = tokenizer.bos_token_id
+            assert isinstance(eot_token_id, int)
+
+            def get_replacement_fuyu(item_idx: int):
+                images = mm_items.get_items("image", ImageProcessorItems)
+                image_size = images.get_image_size(item_idx)
+
+                ncols, nrows = self.info.get_image_feature_grid_size(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+                image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
+                                [_NEWLINE_TOKEN_ID]) * nrows
+
+                return PromptUpdateDetails.select_token_id(
+                    image_tokens + [bos_token_id],
+                    embed_token_id=_IMAGE_TOKEN_ID,
+                )
+
+            return [
+                PromptReplacement(
+                    modality="image",
+                    target=[eot_token_id],
+                    replacement=get_replacement_fuyu,
+                )
+            ]
+        ```

 ## 5. Register processor-related classes