[1/2] Move InternVL-based processors (#37260)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-17 21:50:56 +08:00
committed by GitHub
parent 2660b9289c
commit f340324335
20 changed files with 3252 additions and 3099 deletions

View File

@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (
from vllm.transformers_utils.processors.h2ovl import (
calculate_h2ovl_targets,
get_h2ovl_target_ratios,
)

View File

@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.internvl import (
from vllm.transformers_utils.processors.internvl import (
calculate_internvl_targets,
get_internvl_target_ratios,
)

View File

@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
from vllm.transformers_utils.processors.nemotron_vl import (
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)

View File

@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
@@ -27,13 +26,9 @@ from .interfaces import (
SupportsPP,
)
from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo,
BaseInternVLProcessor,
)
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
)
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Custom processor for Eagle2.5-VL model.
    Extends BaseInternVLProcessor with Eagle-specific token handling.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Skip super().__init__() to avoid config manipulation
        # Directly initialize all required attributes
        self.config = config
        self.tokenizer = tokenizer

        # Image size with force_image_size override
        image_size: int = config.vision_config.image_size
        if hasattr(config, "force_image_size") and config.force_image_size:
            image_size = config.force_image_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Compute num_image_token: tokens per tile is the squared number of
        # patches per side, scaled by the squared downsample ratio
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (downsample_ratio**2)
        )
        self.image_size = image_size

        # Dynamic patch settings with defaults; explicit keyword arguments
        # take precedence over config values
        self.min_dynamic_patch = (
            min_dynamic_patch
            if min_dynamic_patch is not None
            else getattr(config, "min_dynamic_patch", 1)
        )
        self.max_dynamic_patch = (
            max_dynamic_patch
            if max_dynamic_patch is not None
            else getattr(config, "max_dynamic_patch", 12)
        )
        self.dynamic_image_size = (
            dynamic_image_size
            if dynamic_image_size is not None
            else getattr(config, "dynamic_image_size", True)
        )
        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Get the image token ID from config or tokenizer."""
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index
        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT in vocab:
            return vocab[IMG_CONTEXT]
        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Get image replacement string for prompt.

        The replacement is ``<img>`` + ``feature_size`` repetitions of the
        image-context token + ``</img>``; only the context tokens are
        selected as embedding positions.
        """
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model."""

View File

@@ -11,7 +11,6 @@
from collections.abc import Mapping, Sequence
import torch
from PIL import Image
from transformers import PretrainedConfig
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
ProcessorInputs,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
TimingContext,
)
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
from .intern_vit import InternVisionModel
from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo,
BaseInternVLProcessor,
InternVLChatModel,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Clamp the dynamic tile-count bounds for H2OVL.

    When dynamic tiling is disabled both bounds collapse to 1; when a
    thumbnail tile is requested and tiling is possible, the upper bound
    grows by one to account for it.
    """
    if not dynamic_image_size:
        min_dynamic_patch = 1
        max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch = max_dynamic_patch + 1

    return min_dynamic_patch, max_dynamic_patch
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return candidate (cols, rows) tile grids for H2OVL.

    Starts from the InternVL candidate grids; when a ``prior_aspect_ratio``
    from a previous MSAC pass is given, keeps only grids where neither
    component of the prior grid is divisible by the corresponding component.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return candidates

    prior_w, prior_h = prior_aspect_ratio
    return [
        (w, h)
        for w, h in candidates
        if prior_w % w != 0 and prior_h % h != 0
    ]
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Choose the best tile grid for an image and derive tiling targets.

    Returns ``(num_blocks, target_width, target_height, chosen_grid)``,
    where ``chosen_grid`` is the (cols, rows) ratio picked from
    ``target_ratios``.
    """
    # pick the candidate grid whose aspect ratio best matches the image
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = grid

    blocks = cols * rows
    # a thumbnail tile is appended whenever the image is actually split
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * cols, image_size * rows, grid
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Split ``image`` into square tiles of side ``image_size``.

    Returns the tile list together with the (cols, rows) grid used, so a
    second MSAC pass can exclude related grids.
    """
    width, height = image.size

    # choose the grid first; the thumbnail is handled after cropping
    num_tiles, grid_width, grid_height, grid = calculate_h2ovl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((grid_width, grid_height))
    cols = grid_width // image_size

    tiles: list[Image.Image] = []
    for idx in range(num_tiles):
        left = (idx % cols) * image_size
        top = (idx // cols) * image_size
        tiles.append(
            resized.crop((left, top, left + image_size, top + image_size))
        )
    assert len(tiles) == num_tiles

    # append a whole-image thumbnail when the image was actually split
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, grid
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Run one H2OVL tiling pass over ``image``.

    Returns the stacked tile tensors and the (cols, rows) grid used, which
    callers feed back as ``prior_aspect_ratio`` for a second pass.
    """
    ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)

    tiles, grid = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=ratios,
    )
    stacked = torch.stack([to_tensor(tile) for tile in tiles])
    return stacked, grid
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert one image into stacked tile tensors, optionally with MSAC.

    With MSAC enabled the image is tiled twice — a coarse first pass and a
    finer second pass that excludes grids related to the first — and the
    two passes' tiles are concatenated with a single shared thumbnail.
    """
    if not use_msac:
        pixel_values, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return pixel_values

    # first pass: coarse tiling, its thumbnail is the last tile
    first_pass, first_grid = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_grid if False else None,  # no prior on pass 1
    )
    # second pass: finer tiling, avoiding grids related to the first
    second_pass, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_grid,
    )
    # drop each pass's own thumbnail and keep a single one at the end
    return torch.cat(
        [second_pass[:-1], first_pass[:-1], second_pass[-1:]], 0
    )
class H2OVLProcessor(BaseInternVLProcessor):
    """InternVL-style processor for H2OVL models.

    Adds MSAC (two-pass multi-scale tiling): when enabled, each image is
    tiled twice and the two passes' tiles are combined with a single
    shared thumbnail tile.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_msac: bool | None = None,
    ) -> None:
        super().__init__(
            config,
            tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        # Keyword override falls back to the model config.
        if use_msac is None:
            use_msac = config.use_msac
        assert isinstance(use_msac, bool)

        self.use_msac = use_msac

    @property
    def image_token_id(self) -> int:
        """Token id of the <IMG_CONTEXT> placeholder."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the <img>...</img> replacement text for one image."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve tile-count bounds, falling back to instance defaults."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail

        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate tile grids; ``override_min_num`` forces the lower bound."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        if override_min_num is not None:
            min_num = override_min_num

        return get_h2ovl_target_ratios(
            min_num,
            max_num,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Number of image tokens produced for an image of the given size.

        Mirrors the tiling performed by ``image_to_pixel_values_h2ovl``.
        """
        use_msac = self.use_msac if use_msac is None else use_msac

        use_thumbnail = self.use_thumbnail
        if use_msac:
            # First pass: coarse tiling (min 1 tile), thumbnail included.
            target_ratios_1 = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
                override_min_num=1,
            )
            num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=target_ratios_1,
                use_thumbnail=True,
            )

            # Second pass: finer tiling (min 3 tiles), excluding grids
            # related to the first pass's grid.
            target_ratios_2 = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
                prior_aspect_ratio=aspect_ratio_1,
                override_min_num=3,
            )
            num_patches_2, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=target_ratios_2,
                use_thumbnail=True,
            )

            # The two passes share a single thumbnail, hence the -1
            # (matches the concatenation in image_to_pixel_values_h2ovl).
            num_patches = num_patches_1 + num_patches_2 - 1
        else:
            target_ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            num_patches, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=target_ratios,
                use_thumbnail=use_thumbnail,
            )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Tile each image into a tensor of stacked tiles."""
        # MSAC is only applied when exactly one image is given.
        use_msac = self.use_msac if len(images) == 1 else False

        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_h2ovl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                use_msac=use_msac,
            )
            for image in images
        ]
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
return self.ctx.init_processor(

View File

@@ -7,16 +7,13 @@
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from abc import ABC, abstractmethod
from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Any, Literal, TypeAlias, TypeVar
from typing import Annotated, Literal, TypeAlias, TypeVar
import numpy.typing as npt
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import BatchFeature, PretrainedConfig
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
)
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
BaseProcessingInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processors.internvl import (
BaseInternVLProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
@@ -60,13 +58,6 @@ from .interfaces import (
)
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
# Special tokens delimiting image features in the prompt text
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
# ImageNet channel statistics used to normalize input tiles
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class InternVLImagePixelInputs(TensorSchema):
"""
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the tile preprocessing pipeline.

    Converts to RGB, resizes to a square of side ``input_size`` with
    bicubic interpolation, converts to a tensor, and normalizes with
    ImageNet statistics.
    """
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
            ),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the (cols, rows) grid whose aspect ratio best matches the image.

    Ties are broken in favor of larger grids when the image area exceeds
    half the area the candidate grid would cover.
    """
    best: tuple[int, int] = (1, 1)
    best_diff = float("inf")
    area = width * height

    for cols, rows in target_ratios:
        diff = abs(aspect_ratio - cols / rows)
        if diff < best_diff:
            best_diff = diff
            best = (cols, rows)
        elif diff == best_diff and area > 0.5 * image_size * image_size * cols * rows:
            best = (cols, rows)

    return best
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Clamp the dynamic tile-count bounds for InternVL.

    Disabling dynamic tiling collapses both bounds to 1; requesting a
    thumbnail raises the upper bound by one when tiling is possible.
    """
    if not dynamic_image_size:
        min_dynamic_patch = 1
        max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch = max_dynamic_patch + 1

    return min_dynamic_patch, max_dynamic_patch
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate candidate (cols, rows) tile grids.

    Includes every pair whose product lies in ``[min_num, max_num]``,
    sorted by total tile count.
    """
    ratios: set[tuple[int, int]] = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    ratios.add((i, j))
    return sorted(ratios, key=lambda pair: pair[0] * pair[1])
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose the best tile grid for an image and derive tiling targets.

    Returns ``(num_blocks, target_width, target_height)``.
    """
    # pick the candidate grid whose aspect ratio best matches the image
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = grid

    blocks = cols * rows
    # a thumbnail tile is appended whenever the image is actually split
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * cols, image_size * rows
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split ``image`` into square tiles of side ``image_size``."""
    width, height = image.size

    # choose the grid first; the thumbnail is handled after cropping
    num_tiles, grid_width, grid_height = calculate_internvl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((grid_width, grid_height))
    cols = grid_width // image_size

    tiles: list[Image.Image] = []
    for idx in range(num_tiles):
        left = (idx % cols) * image_size
        top = (idx // cols) * image_size
        tiles.append(
            resized.crop((left, top, left + image_size, top + image_size))
        )
    assert len(tiles) == num_tiles

    # append a whole-image thumbnail when the image was actually split
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile one image dynamically and stack the tile tensors along dim 0."""
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=get_internvl_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    return torch.stack([to_tensor(tile) for tile in tiles])
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a stack of RGB frames into a tensor of per-frame tiles.

    Each frame is expected to produce exactly one tile (callers pass
    non-dynamic bounds), so the output has one entry per frame.
    """
    ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    frames: list[Image.Image] = []
    for frame in video:
        tiles = dynamic_preprocess_internvl(
            Image.fromarray(frame, mode="RGB"),
            target_ratios=ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        assert len(tiles) == 1
        frames.extend(tiles)

    return torch.stack([to_tensor(img) for img in frames])
class BaseInternVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        # Keyword overrides fall back to the model config.
        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Tokens per tile: squared patches-per-side scaled by the squared
        # downsample ratio.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Token id of the image-context placeholder token."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the prompt text that replaces one ``<image>`` placeholder."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve tile-count bounds, falling back to instance defaults."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate (cols, rows) tile grids for the resolved bounds."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        return get_internvl_target_ratios(min_num, max_num)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of image tokens produced for an image of the given size."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Tile each image into a (num_tiles, C, H, W) tensor."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_internvl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile images and expand each ``<image>`` placeholder in ``text``."""
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                # All tiles of all images concatenated along dim 0
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token

                image_repl = self.get_image_repl(feature_size, num_patches)
                # Placeholders are expanded one at a time, in order.
                text = [t.replace("<image>", image_repl.full, 1) for t in text]
        return text, image_inputs

    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
        """Normalize ``None`` / scalar inputs into a list."""
        if input_item is None:
            input_item = []
        if not isinstance(input_item, list):
            input_item = [input_item]
        return input_item

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """HF-processor-style entry point: tokenize text with image features."""
        text, images = [self._make_batch_input(x) for x in (text, images)]

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        combined_outputs = {**text_inputs, **image_inputs}
        return BatchFeature(combined_outputs, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
    """
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # add extra video token for video processing
        self.video_token = video_token

    @property
    def image_token_id(self) -> int:
        """Token id of the <IMG_CONTEXT> placeholder."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    @property
    def video_token_id(self) -> int | None:
        # None when no video token was configured or it is absent
        # from the tokenizer vocab
        if self.video_token is None:
            return None
        return self.tokenizer.get_vocab().get(self.video_token, None)

    @property
    def supports_video(self) -> bool:
        """Whether video inputs can be processed (video token resolvable)."""
        return self.video_token_id is not None

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each video into a per-frame tile tensor (no tiling)."""
        # Frames are never tiled: bounds are pinned to one patch per frame.
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            video_to_pixel_values_internvl(
                video,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=False,
            )
            for video in videos
        ]

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ):
        """Convert videos to tensors and expand ``<video>`` placeholders."""
        if len(videos) == 0 or not self.supports_video:
            video_inputs = {}
        else:
            pixel_values_lst_video = self._videos_to_pixel_values_lst(
                videos,
                dynamic_image_size=dynamic_image_size,
            )
            video_inputs = {
                # All frames of all videos concatenated along dim 0
                "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
                "video_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst_video]
                ),
            }

            for pixel_values in pixel_values_lst_video:
                num_patches = pixel_values.shape[0]

                video_repl = self.get_video_repl(
                    self.num_image_token, num_patches, self.video_token
                )
                # Placeholders are expanded one at a time, in order.
                text = [t.replace("<video>", video_repl.full, 1) for t in text]
        return text, video_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """HF-processor-style entry point handling text, images and videos."""
        text, images, videos = [
            self._make_batch_input(x) for x in (text, images, videos)
        ]

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text, video_inputs = self._preprocess_video(
            text=text,
            videos=videos,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
        return BatchFeature(combined_outputs, tensor_type=return_tensors)

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the <img>...</img> replacement text for one image."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None = None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Build the per-frame ``FrameN: <img>...</img>`` replacement text.

        NOTE(review): ``feature_size`` is not used in the body — the
        per-frame repeat count is always ``self.num_image_token``; confirm
        this is intended (callers pass ``self.num_image_token`` anyway).
        """
        repl_features = video_context_token * self.num_image_token
        repl_features_with_sep = IMG_START + repl_features + IMG_END
        # num_patches is equal to num_frames
        repl_full = "".join(
            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
        )

        return PromptUpdateDetails.select_text(repl_full, video_context_token)
class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""

File diff suppressed because it is too large Load Diff

View File

@@ -11,18 +11,13 @@ import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Literal
import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from PIL import Image
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torchvision import transforms as T
from transformers import (
BartConfig,
BatchFeature,
PretrainedConfig,
TensorType,
)
from vllm.config import CacheConfig, VllmConfig
@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
PromptUpdate,
)
from vllm.renderers import TokenizeParams
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.processors.nemotron_parse import NemotronParseProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.v1.attention.backend import AttentionType
logger = init_logger(__name__)
# Default padded output size as (height, width) — consumed as
# (target_height, target_width) by NemotronParseImageProcessor
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class BartScaledWordEmbedding(VocabParallelEmbedding):
@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
class NemotronParseImageProcessor:
"""
NemotronParse Image Processor
"""
    def __init__(
        self,
        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
        **kwargs,
    ):
        """Initialize the image processor.

        final_size: target output size as (height, width) — a scalar is
        expanded to a square; malformed values fall back to the default.
        """
        # Ensure final_size is properly formatted
        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
            self.final_size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            self.final_size = (int(final_size), int(final_size))
        else:
            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback

        # CLIP normalization constants, shaped for (N, C, H, W) broadcasting
        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)

        # Create transforms
        self._create_transforms()
def _create_transforms(self):
"""Create transform objects."""
try:
import albumentations as A
except ImportError as err:
raise ImportError(
"The package `albumentations` is required to use "
"NemotronParse model. Please install it with `pip install "
"albumentations`."
) from err
# Ensure final_size is a tuple of integers
if isinstance(self.final_size, (list, tuple)):
self.target_height, self.target_width = (
int(self.final_size[0]),
int(self.final_size[1]),
)
else:
self.target_height = self.target_width = int(self.final_size)
import cv2
self.transform = A.Compose(
[
A.PadIfNeeded(
min_height=self.target_height,
min_width=self.target_width,
border_mode=cv2.BORDER_CONSTANT,
fill=[255, 255, 255],
p=1.0,
),
]
)
self.torch_transform = T.Compose(
[
T.ToTensor(),
]
)
def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
"""Resize image maintaining aspect ratio (exact replica of original
LongestMaxSizeHW)."""
height, width = image.shape[:2]
max_size_height = self.target_height
max_size_width = self.target_width
# Original LongestMaxSizeHW algorithm from custom_augmentations.py
aspect_ratio = width / height
new_height = height
new_width = width
# If height too big then scale image down
if height > max_size_height:
new_height = max_size_height
new_width = int(new_height * aspect_ratio)
# If width too big, scale image down further
if new_width > max_size_width:
new_width = max_size_width
new_height = int(new_width / aspect_ratio)
# Use cv2.INTER_LINEAR like the original
import cv2
return cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
)
def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
"""Pad image to target size with white padding (matches A.PadIfNeeded
behavior)."""
h, w = image.shape[:2]
min_height, min_width = self.target_height, self.target_width
# Only pad if image is smaller than target (matches A.PadIfNeeded logic)
pad_h = max(0, min_height - h)
pad_w = max(0, min_width - w)
if pad_h == 0 and pad_w == 0:
return image
# A.PadIfNeeded pads to bottom-right with constant value
if len(image.shape) == 3:
# Color image - pad bottom and right with white (255, 255, 255)
padded = np.pad(
image,
((0, pad_h), (0, pad_w), (0, 0)),
mode="constant",
constant_values=255,
)
else:
# Grayscale image - pad with white (255)
padded = np.pad(
image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
)
return padded
def preprocess(
self,
images: Image.Image | list[Image.Image],
**kwargs,
) -> dict[str, torch.Tensor]:
"""
Preprocess an image or batch of images for the NemotronParse model.
Args:
images: Input image(s)
"""
# Ensure images is a list
if not isinstance(images, list):
images = [images]
# Convert PIL images to numpy arrays if needed
processed_images = []
for image in images:
if isinstance(image, Image.Image):
image = np.asarray(image)
processed_images.append(image)
# Apply NemotronParse-specific transforms
pixel_values = []
for image in processed_images:
# Manual resize with aspect ratio preservation
# (replaces LongestMaxSizeHW)
processed_image = self._resize_with_aspect_ratio(image)
# Apply remaining albumentations transforms if available
if self.transform is not None:
transformed = self.transform(image=processed_image)
processed_image = transformed["image"]
else:
# Fallback: just pad to target size
processed_image = self._pad_to_size(processed_image)
# Convert to tensor
pixel_values_tensor = self.torch_transform(processed_image)
# Handle grayscale images
if pixel_values_tensor.shape[0] == 1:
pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
pixel_values.append(pixel_values_tensor)
# Stack into batch
pixel_values = torch.stack(pixel_values)
# Normalize pixel values
normalized_values = (pixel_values - self.norm_mean) / self.norm_std
return {"pixel_values": normalized_values}
def __call__(
self, images: Image.Image | list[Image.Image], **kwargs
) -> dict[str, torch.Tensor]:
return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
    """
    NemotronParse Processor.

    Pairs the model tokenizer with a :class:`NemotronParseImageProcessor`
    and merges both outputs into a single :class:`BatchFeature`.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        # Canvas size for image preprocessing comes from the model config.
        self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)

    def _make_batch_input(self, input_item=None):
        # Normalize None / a single item into a list.
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        # Image features are computed only when images were provided.
        image_inputs = self.image_processor(images) if images else {}
        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
class NemotronParseProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()

View File

@@ -1,22 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import math
from abc import ABC
from collections.abc import Iterable
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from transformers import AutoModel, PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.config import VllmConfig
from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
InternVLImageEmbeddingInputs,
InternVLImageInputs,
InternVLImagePixelInputs,
InternVLProcessor,
)
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_image_processor_from_config
from vllm.transformers_utils.processors.nemotron_vl import (
LlamaNemotronVLEmbedProcessor,
NemotronVLProcessor,
)
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
from .interfaces import (
@@ -58,310 +47,6 @@ from .utils import (
)
def build_transform(input_size: int):
    """Build the base Nemotron-VL image transform.

    Converts to RGB, resizes (bicubic) to a square of side ``input_size``,
    and converts to a tensor. No normalization is applied here.
    """
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
        ),
        T.ToTensor(),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the candidate tile grid that best matches the image.

    Each candidate is scored by (coverage, capped at 0.6) * (aspect-ratio
    closeness); the highest-scoring grid wins, ties keeping the earliest
    candidate. Returns (1, 1) when ``target_ratios`` is empty.
    """
    pixel_area = width * height
    chosen = (1, 1)
    chosen_score = float("-inf")
    for cols, rows in target_ratios:
        candidate_aspect = cols / rows
        # How much of the tiled canvas the source image could fill, capped.
        coverage = min((cols * rows * image_size * image_size) / pixel_area, 0.6)
        closeness = min(
            candidate_aspect / aspect_ratio, aspect_ratio / candidate_aspect
        )
        score = coverage * closeness
        if score > chosen_score:
            chosen_score = score
            chosen = (cols, rows)
    return chosen
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Resolve the tiling for one image.

    Returns ``(num_blocks, target_width, target_height)`` where the target
    dimensions are the tile grid size in pixels and ``num_blocks`` includes
    an extra global-thumbnail tile when requested (and the grid has more
    than one tile).
    """
    # find the closest aspect ratio to the target
    cols, rows = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    # calculate the target width and height
    target_width = image_size * cols
    target_height = image_size * rows
    num_blocks = cols * rows
    # add thumbnail image if num_blocks != 1
    if use_thumbnail and num_blocks != 1:
        num_blocks += 1
    return num_blocks, target_width, target_height
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split ``image`` into a grid of square ``image_size`` tiles.

    The thumbnail tile (a resize of the full image) is appended here, so the
    target calculation is invoked without it.
    """
    src_width, src_height = image.size
    # calculate the number of blocks without thumbnail
    num_tiles, grid_width, grid_height = calculate_nemotron_vl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )
    # resize the image to the full grid, then crop out each tile
    resized = image.resize((grid_width, grid_height))
    tiles_per_row = grid_width // image_size
    tiles: list[Image.Image] = []
    for idx in range(num_tiles):
        col = idx % tiles_per_row
        row = idx // tiles_per_row
        left = col * image_size
        top = row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == num_tiles
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate all (cols, rows) grids with a tile count in
    ``[min_num, max_num]``, sorted by total tile count."""
    ratios: set[tuple[int, int]] = set()
    for total in range(min_num, max_num + 1):
        for cols in range(1, total + 1):
            for rows in range(1, total + 1):
                if min_num <= cols * rows <= max_num:
                    ratios.add((cols, rows))
    return sorted(ratios, key=lambda pair: pair[0] * pair[1])
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile one image and transform each tile into a tensor.

    Returns a ``(num_tiles, ...)`` tensor. When ``transform`` is omitted the
    base (unnormalized) transform for ``input_size`` is used.
    """
    ratios = get_nemotron_vl_target_ratios(min_num, max_num)
    if transform is None:
        transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    return torch.stack([transform(tile) for tile in tiles])
class NemotronVLProcessor(InternVLProcessor):
    """
    InternVL-style processor specialized for Nemotron VL checkpoints.

    Reads image geometry from ``config.force_image_size`` /
    ``config.patch_size`` and uses ``<image>`` as the per-feature context
    token.
    """

    # Prompt markers wrapped around the repeated context token.
    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Deliberately bypass InternVLProcessor.__init__; only the ABC
        # machinery is invoked, and all attributes are set here.
        ABC.__init__(self)
        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        image_size: int = config.force_image_size
        patch_size: int = config.patch_size
        if min_dynamic_patch is None:
            min_dynamic_patch = 1
        assert isinstance(min_dynamic_patch, int)
        if max_dynamic_patch is None:
            # NOTE(review): this path requires image_processor to be given;
            # otherwise pass max_dynamic_patch explicitly.
            max_dynamic_patch = self.image_processor.max_num_tiles
        assert isinstance(max_dynamic_patch, int)
        if dynamic_image_size is None:
            dynamic_image_size = True
        assert isinstance(dynamic_image_size, bool)
        # LM tokens produced per image tile after downsampling.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        if image_processor is not None:
            self.use_thumbnail = image_processor.use_thumbnail
        else:
            self.use_thumbnail = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        # Vocabulary ID of the image context token.
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        # Base (unnormalized) transform; subclasses may override.
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of LM tokens an image of the given size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )
        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Tile and transform each image into a per-image tensor."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        return [
            image_to_pixel_values_nemotron_vl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=self._get_transform(),
            )
            for image in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens."""
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            # Use temporary placeholder to avoid replacing tokens we just
            # inserted (the replacement itself contains "<image>").
            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Compute pixel inputs and expand image placeholders in the text."""
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }
            text = self._replace_image_tokens(text, pixel_values_lst)
        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the IMG_START + IMG_CONTEXT*feature_size + IMG_END string."""
        repl_features = self.IMG_CONTEXT * feature_size
        repl_full = self.IMG_START + repl_features + self.IMG_END
        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Nemotron VL models."""
@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# - Pooler output instead of generative logits
# --------------------------------------------------------
# SigLIP normalization constants
# Channel mean/std applied by build_siglip_transform's final Normalize step.
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Build transform for SigLIP vision encoder with normalization.

    Extends the base transform from nemotron_vl with SigLIP-specific
    normalization.
    """
    return T.Compose(
        [
            build_transform(input_size=input_size),
            T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
        ]
    )
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for LlamaNemotronVL embedding model.

    Inherits from NemotronVLProcessor and specializes it for embedding tasks:
    - Uses SigLIP transform with normalization instead of base transform
    - Uses different image context token (<IMG_CONTEXT> vs <image>)
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Tiling settings: explicit argument > processor_config entry >
        # model-config attribute > hard default.
        if min_dynamic_patch is None:
            min_dynamic_patch = processor_config.get(
                "min_input_tiles",
                getattr(config, "min_dynamic_patch", 1),
            )
        if max_dynamic_patch is None:
            max_dynamic_patch = processor_config.get(
                "max_input_tiles",
                getattr(config, "max_dynamic_patch", 1),
            )
        if dynamic_image_size is None:
            dynamic_image_size = processor_config.get(
                "dynamic_image_size",
                getattr(config, "dynamic_image_size", True),
            )
        # image_processor=None: the parent falls back to config.use_thumbnail.
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Override to add SigLIP normalization."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Override with simpler token replacement for embedding model.

        No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
        not <image>, so there's no collision risk.
        """
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            text = [t.replace("<image>", image_repl.full, 1) for t in text]
        return text
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
"""Processing info for LlamaNemotronVL embedding model."""

View File

@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from .intern_vit import InternVisionModel
from .internvl import (
BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo,
BaseInternVLProcessor,
InternVLChatModel,
)
# Per-feature padding token repeated after each tile identifier
# (see NVLMProcessor.get_image_repl).
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style processor for NVLM-D: each tile's features are
    prefixed with a positional identifier (``<tile_k>``) and the whole
    sequence is wrapped in ``<Image>``/``</Image>``."""

    @property
    def image_token_id(self) -> int:
        # Vocabulary ID of the per-feature padding token.
        return self.tokenizer.get_vocab()[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement for one image.

        Emits ``num_patches - 1`` numbered tile identifiers (plus a global
        thumbnail identifier when thumbnails are enabled), each followed by
        ``feature_size // num_patches`` IMG_PAD tokens.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")
        tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
        if self.use_thumbnail:
            tile_pos_identifiers += ["<tile_global_thumbnail>"]
        context_size = feature_size // num_patches
        features = "".join(
            identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
        )
        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        repl = "<Image>" + features + "</Image>"
        return PromptUpdateDetails.select_text(repl, IMG_PAD)
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:

View File

@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import BatchFeature, PretrainedConfig
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
InternVisionPatchModel,
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
BaseProcessingInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
IMG_START = "<img>"  # opens an image span in the prompt
IMG_END = "</img>"  # closes an image span in the prompt
IMG_CONTEXT = "<IMG_CONTEXT>"  # per-feature placeholder token
IMAGENET_MEAN = (0.485, 0.456, 0.406)  # RGB means used by build_transform
IMAGENET_STD = (0.229, 0.224, 0.225)  # RGB stds used by build_transform
class SkyworkR1VImagePixelInputs(TensorSchema):
"""
@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """SkyworkR1V preprocessing: RGB conversion, bicubic resize to a square,
    tensor conversion, and ImageNet normalization."""
    mean, std = IMAGENET_MEAN, IMAGENET_STD
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
        ),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the candidate grid whose aspect ratio is closest to the image's.

    On an exact tie, a later candidate wins only when the source image is
    large enough to fill more than half of that candidate's tiled area.
    """
    area = width * height
    best = (1, 1)
    best_diff = float("inf")
    for candidate in target_ratios:
        candidate_aspect = candidate[0] / candidate[1]
        diff = abs(aspect_ratio - candidate_aspect)
        if diff < best_diff:
            best_diff = diff
            best = candidate
        elif diff == best_diff:
            # Tie-break: prefer the larger grid for sufficiently big images.
            if area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
                best = candidate
    return best
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) tile counts.

    Dynamic tiling disabled forces a single tile; an enabled thumbnail adds
    one slot to the maximum (unless the maximum is already 1).
    """
    if not dynamic_image_size:
        min_dynamic_patch = 1
        max_dynamic_patch = 1
    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1
    return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate all (cols, rows) grids with a tile count in
    ``[min_num, max_num]``, sorted by total tile count."""
    ratios: set[tuple[int, int]] = set()
    for total in range(min_num, max_num + 1):
        for cols in range(1, total + 1):
            for rows in range(1, total + 1):
                if min_num <= cols * rows <= max_num:
                    ratios.add((cols, rows))
    return sorted(ratios, key=lambda pair: pair[0] * pair[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Resolve the tiling for one image.

    Returns ``(num_blocks, target_width, target_height)``; the thumbnail
    tile is counted only when requested and the grid has more than one tile.
    """
    # find the closest aspect ratio to the target
    cols, rows = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    # calculate the target width and height
    target_width = image_size * cols
    target_height = image_size * rows
    blocks = cols * rows
    # add thumbnail image if num_blocks != 1
    if use_thumbnail and blocks != 1:
        blocks += 1
    return blocks, target_width, target_height
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split ``image`` into a grid of square ``image_size`` tiles.

    The optional thumbnail (a resize of the whole image) is appended here,
    so the target calculation runs without it.
    """
    src_width, src_height = image.size
    # calculate the number of blocks without thumbnail
    num_tiles, grid_width, grid_height = calculate_skyworkr1v_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )
    # resize to the full grid, then crop out each tile
    resized = image.resize((grid_width, grid_height))
    tiles_per_row = grid_width // image_size
    tiles: list[Image.Image] = []
    for idx in range(num_tiles):
        col = idx % tiles_per_row
        row = idx // tiles_per_row
        left = col * image_size
        top = row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == num_tiles
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile one image and transform each tile into a normalized tensor,
    stacked into a ``(num_tiles, ...)`` tensor."""
    ratios = get_skyworkr1v_target_ratios(min_num, max_num)
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    return torch.stack([transform(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size
        # Tiling parameters fall back to the model config when not given.
        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)
        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)
        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)
        # LM tokens produced per tile after spatial downsampling.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        # Vocabulary ID of the IMG_CONTEXT placeholder token.
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the IMG_START + IMG_CONTEXT*feature_size + IMG_END string."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END
        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve effective tile-count bounds, defaulting to instance state."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate (cols, rows) grids for the resolved tile-count bounds."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_skyworkr1v_target_ratios(min_num, max_num)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of LM tokens an image of the given size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
        num_patches, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )
        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Tile and transform each image into a per-image tensor."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        return [
            image_to_pixel_values_skyworkr1v(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize text and preprocess images into a single BatchFeature.

        Each "<image>" placeholder in the text is expanded (in order) into
        the full image-token replacement string before tokenization.
        """
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token
                image_repl = self.get_image_repl(feature_size, num_patches)
                text = [t.replace("<image>", image_repl.full, 1) for t in text]
        text_inputs = self.tokenizer(text)
        combined_outputs = {**text_inputs, **image_inputs}
        return BatchFeature(combined_outputs, tensor_type=return_tensors)
class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
return self.ctx.init_processor(

View File

@@ -13,35 +13,53 @@ import importlib
# Public API of the processors package; these names are importable
# directly from ``vllm.transformers_utils.processors``.
__all__ = [
    "BagelProcessor",
    "DeepseekVLV2Processor",
    "Eagle2_5_VLProcessor",
    "FireRedASR2Processor",
    "FunASRProcessor",
    "GLM4VProcessor",
    "H2OVLProcessor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
    "InternVLProcessor",
    "KimiAudioProcessor",
    "MistralCommonPixtralProcessor",
    "MistralCommonVoxtralProcessor",
    "NanoNemotronVLProcessor",
    "NemotronParseProcessor",
    "NemotronVLProcessor",
    "LlamaNemotronVLEmbedProcessor",
    "NVLMProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
    "SkyworkR1VProcessor",
]

# Maps each public class name to its defining module.
# NOTE(review): presumably consumed with importlib for lazy attribute
# resolution (the module imports importlib) — confirm against the
# package's __getattr__.
_CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
    "Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
    "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
    "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
    "H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
    "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
    "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
    "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
    "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
    "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
    "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
    "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
    "NemotronParseProcessor": "vllm.transformers_utils.processors.nemotron_parse",
    "NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
    "LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
    "NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
    "OvisProcessor": "vllm.transformers_utils.processors.ovis",
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
    "SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
}

View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Custom processor for the Eagle2.5-VL model.

    Extends BaseInternVLProcessor but initializes its attributes directly,
    because Eagle configs expose optional fields (``force_image_size``,
    ``downsample_ratio``, ...) that the base initializer does not handle.
    """
    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Deliberately skip super().__init__() to avoid the base class's
        # config manipulation; populate every required attribute here.
        self.config = config
        self.tokenizer = tokenizer
        vision_cfg = config.vision_config
        # Image size, honoring an optional force_image_size override.
        image_size: int = vision_cfg.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size
        patch_size: int = vision_cfg.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
        # Feature tokens contributed by one image tile.
        tokens_per_side = image_size // patch_size
        self.num_image_token = int(tokens_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size
        # Dynamic-patch settings: explicit argument wins, then config, then default.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
    @property
    def image_token_id(self) -> int:
        """Resolve the image token ID: config first, then tokenizer vocab."""
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index
        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
        token_id = self.tokenizer.get_vocab().get(IMG_CONTEXT)
        if token_id is None:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return token_id
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the <img>...</img> replacement text for one image placeholder."""
        body = IMG_CONTEXT * feature_size
        return PromptUpdateDetails.select_text(
            f"{IMG_START}{body}{IMG_END}", IMG_CONTEXT
        )

View File

@@ -0,0 +1,390 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
import torch
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLProcessor,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) dynamic patch counts.

    When dynamic tiling is disabled both bounds collapse to 1; when a
    thumbnail tile is used the upper bound grows by one (unless it is 1).
    """
    if not dynamic_image_size:
        return 1, 1
    upper = max_dynamic_patch
    if use_thumbnail and upper != 1:
        upper += 1
    return min_dynamic_patch, upper
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return candidate (w, h) tile ratios for H2OVL.

    Starts from the InternVL candidates; when a prior aspect ratio is
    supplied (MSAC second pass), ratios whose components divide the prior
    ratio's components are filtered out.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return candidates
    prior_w, prior_h = prior_aspect_ratio
    return [
        (w, h)
        for w, h in candidates
        if prior_w % w != 0 and prior_h % h != 0
    ]
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the closest tiling grid and return
    (num_blocks, target_width, target_height, chosen_aspect_ratio)."""
    best_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    # Grid dimensions in pixels and number of tiles in the grid.
    target_width = image_size * best_ratio[0]
    target_height = image_size * best_ratio[1]
    blocks = best_ratio[0] * best_ratio[1]
    # A thumbnail tile is appended whenever more than one tile is produced.
    if use_thumbnail and blocks != 1:
        blocks += 1
    return blocks, target_width, target_height, best_ratio
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Split an image into square tiles following the closest target ratio.

    Returns the tile list (plus an optional thumbnail) and the aspect
    ratio that was chosen, so a second MSAC pass can filter against it.
    """
    orig_width, orig_height = image.size
    # Block count is computed without the thumbnail; it is appended below.
    (
        blocks,
        target_width,
        target_height,
        aspect_ratio,
    ) = calculate_h2ovl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )
    resized = image.resize((target_width, target_height))
    cols = target_width // image_size
    tiles: list[Image.Image] = []
    for idx in range(blocks):
        left = (idx % cols) * image_size
        top = (idx // cols) * image_size
        # Crop one square tile out of the resized grid.
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles, aspect_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Tile one image and stack the transformed tiles into a tensor.

    Returns the stacked pixel values and the aspect ratio chosen for the
    tiling (fed back into the MSAC second pass).
    """
    ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    tiles, aspect_ratio = dynamic_preprocess_h2ovl(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    to_tensor = build_transform(input_size=input_size)
    return torch.stack([to_tensor(tile) for tile in tiles]), aspect_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert an image to stacked tile tensors, optionally using MSAC.

    With MSAC the image is tiled twice (coarse pass, then a finer pass
    whose candidate ratios are filtered by the first pass's ratio) and
    the tiles of both passes are concatenated with a single thumbnail.
    """
    if not use_msac:
        pixel_values, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return pixel_values
    # First (coarse) pass over the full candidate-ratio space.
    coarse, coarse_ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )
    # Second (fine) pass, excluding ratios that divide the first pass's.
    fine, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=coarse_ratio,
    )
    # Keep both passes' tiles without their thumbnails, then one thumbnail.
    return torch.cat([fine[:-1], coarse[:-1], fine[-1:]], 0)
class H2OVLProcessor(BaseInternVLProcessor):
    """Processor for H2OVL-Mississippi models.

    Extends the InternVL processor with multi-scale adaptive cropping
    (MSAC): single images may be tiled twice and the resulting tile sets
    combined (see `image_to_pixel_values_h2ovl`). MSAC is only applied
    when exactly one image is being processed.
    """
    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_msac: bool | None = None,
    ) -> None:
        super().__init__(
            config,
            tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # Explicit argument wins; otherwise read the flag from the config.
        if use_msac is None:
            use_msac = config.use_msac
        assert isinstance(use_msac, bool)
        self.use_msac = use_msac
    @property
    def image_token_id(self) -> int:
        """Token ID of the <IMG_CONTEXT> placeholder in the vocabulary."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the <img>...</img> replacement text for one image tag."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END
        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve effective (min, max) patch counts; None falls back to
        the values stored on the processor."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Compute candidate tiling ratios, optionally filtered by a
        prior aspect ratio (MSAC second pass) and/or a forced min_num."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        if override_min_num is not None:
            min_num = override_min_num
        return get_h2ovl_target_ratios(
            min_num,
            max_num,
            prior_aspect_ratio=prior_aspect_ratio,
        )
    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Number of feature tokens one (width, height) image expands to,
        mirroring the two-pass tile counting done at preprocessing time."""
        use_msac = self.use_msac if use_msac is None else use_msac
        use_thumbnail = self.use_thumbnail
        if use_msac:
            # First pass: coarse tiling with min_num forced to 1.
            target_ratios_1 = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
                override_min_num=1,
            )
            num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=target_ratios_1,
                use_thumbnail=True,
            )
            # Second pass: finer tiling filtered by the first pass's ratio.
            target_ratios_2 = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
                prior_aspect_ratio=aspect_ratio_1,
                override_min_num=3,
            )
            num_patches_2, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=target_ratios_2,
                use_thumbnail=True,
            )
            # Both passes counted a thumbnail; only one is kept.
            num_patches = num_patches_1 + num_patches_2 - 1
        else:
            target_ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            num_patches, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=target_ratios,
                use_thumbnail=use_thumbnail,
            )
        return num_patches * self.num_image_token
    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image to a stacked tile tensor.

        MSAC is only enabled for single-image inputs.
        """
        use_msac = self.use_msac if len(images) == 1 else False
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        return [
            image_to_pixel_values_h2ovl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                use_msac=use_msac,
            )
            for image in images
        ]

View File

@@ -0,0 +1,603 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, TypeVar
import numpy.typing as npt
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
# Generic type variable used by _make_batch_input.
_T = TypeVar("_T")
# Special tokens that delimit image features in InternVL prompts.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
# ImageNet normalization statistics used by build_transform().
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the tile transform: convert to RGB, bicubic-resize to a
    square of `input_size`, convert to tensor, ImageNet-normalize."""
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
            ),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the (w, h) ratio from `target_ratios` closest to
    `aspect_ratio`; ties prefer the larger grid when the source image
    area is big enough to fill it."""
    best = (1, 1)
    best_diff = float("inf")
    area = width * height
    for candidate in target_ratios:
        cand_w, cand_h = candidate
        diff = abs(aspect_ratio - cand_w / cand_h)
        if diff < best_diff:
            best_diff = diff
            best = candidate
        elif diff == best_diff:
            # Tie-break: take the larger grid only for large-area images.
            if area > 0.5 * image_size * image_size * cand_w * cand_h:
                best = candidate
    return best
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) dynamic patch counts.

    Disabling dynamic tiling collapses both bounds to 1; a thumbnail
    tile adds one to the upper bound (unless it is already 1).
    """
    if not dynamic_image_size:
        min_dynamic_patch = max_dynamic_patch = 1
    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1
    return min_dynamic_patch, max_dynamic_patch
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate all (w, h) tile grids whose product lies in
    [min_num, max_num], sorted by tile count (ascending)."""
    # Insert in the same (n, i, j) order as before so the set's
    # iteration order -- and therefore sorted() tie-breaking -- is stable.
    seen: set[tuple[int, int]] = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    seen.add((i, j))
    return sorted(seen, key=lambda ratio: ratio[0] * ratio[1])
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Pick the closest tiling grid for the image and return
    (num_blocks, target_width, target_height)."""
    chosen = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    grid_w, grid_h = chosen
    blocks = grid_w * grid_h
    # A thumbnail tile is added whenever more than one tile is produced.
    if use_thumbnail and blocks != 1:
        blocks += 1
    return blocks, image_size * grid_w, image_size * grid_h
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into square tiles matching the closest target
    ratio, optionally appending a thumbnail of the whole image."""
    width, height = image.size
    # Tile count excludes the thumbnail; it is appended at the end.
    num_tiles, grid_width, grid_height = calculate_internvl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )
    grid = image.resize((grid_width, grid_height))
    tiles_per_row = grid_width // image_size
    tiles = []
    for tile_idx in range(num_tiles):
        col = tile_idx % tiles_per_row
        row = tile_idx // tiles_per_row
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(grid.crop(box))
    assert len(tiles) == num_tiles
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile one image and stack the transformed tiles into a
    (num_tiles, C, H, W) tensor."""
    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=get_internvl_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    transform = build_transform(input_size=input_size)
    return torch.stack([transform(tile) for tile in tiles])
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a (frames, H, W, C) video array to stacked frame tensors.

    Each frame must yield exactly one tile (callers pass min/max of 1).
    """
    ratios = get_internvl_target_ratios(min_num, max_num)
    transform = build_transform(input_size=input_size)
    frames: list[Image.Image] = []
    for frame in video:
        tiles = dynamic_preprocess_internvl(
            Image.fromarray(frame, mode="RGB"),
            target_ratios=ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        assert len(tiles) == 1
        frames.extend(tiles)
    return torch.stack([transform(frame_img) for frame_img in frames])
class BaseInternVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """
    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size
        # Keyword overrides take precedence; otherwise read the config.
        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)
        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)
        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)
        # Number of feature tokens contributed by a single image tile.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail
    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Token ID of the image-context placeholder token."""
        raise NotImplementedError
    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text inserted for one `<image>` tag."""
        raise NotImplementedError
    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve effective (min, max) patch counts; None falls back to
        the values stored on the processor."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate (w, h) tiling ratios for the resolved patch bounds."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_internvl_target_ratios(min_num, max_num)
    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of feature tokens one (width, height) image expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )
        return num_patches * self.num_image_token
    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into a stacked tile tensor."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        return [
            image_to_pixel_values_internvl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]
    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand each `<image>` tag in `text` into
        its token replacement; returns (updated_text, image_inputs)."""
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }
            # Replace one <image> tag per image, in order.
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token
                image_repl = self.get_image_repl(feature_size, num_patches)
                text = [t.replace("<image>", image_repl.full, 1) for t in text]
        return text, image_inputs
    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalize None / a single item / a list into a list."""
        if input_item is None:
            input_item = []
        if not isinstance(input_item, list):
            input_item = [input_item]
        return input_item
    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process text (with `<image>` placeholders) and images into a
        single BatchFeature of tokenized text plus pixel values."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        text_inputs = self.tokenizer(text)
        combined_outputs = {**text_inputs, **image_inputs}
        return BatchFeature(combined_outputs, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
    """
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """
    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # add extra video token for video processing; None disables video.
        self.video_token = video_token
    @property
    def image_token_id(self) -> int:
        """Token ID of the <IMG_CONTEXT> placeholder in the vocabulary."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]
    @property
    def video_token_id(self) -> int | None:
        """Token ID of the video placeholder, or None if unavailable."""
        if self.video_token is None:
            return None
        return self.tokenizer.get_vocab().get(self.video_token, None)
    @property
    def supports_video(self) -> bool:
        # Video is supported only when the video token exists in the vocab.
        return self.video_token_id is not None
    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each video into a stacked per-frame tensor.

        Patch bounds are pinned to (1, 1) so every frame yields one tile.
        """
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        return [
            video_to_pixel_values_internvl(
                video,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=False,
            )
            for video in videos
        ]
    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, Any]]:
        """Expand each `<video>` tag in `text` and build video inputs;
        no-op when there are no videos or video is unsupported."""
        if len(videos) == 0 or not self.supports_video:
            return text, {}
        video_token = self.video_token
        assert video_token is not None
        pixel_values_lst_video = self._videos_to_pixel_values_lst(
            videos,
            dynamic_image_size=dynamic_image_size,
        )
        video_inputs = {
            "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
            "video_num_patches": torch.tensor(
                [len(item) for item in pixel_values_lst_video]
            ),
        }
        # Replace one <video> tag per video, in order.
        for pixel_values in pixel_values_lst_video:
            num_patches = pixel_values.shape[0]
            video_repl = self.get_video_repl(
                self.num_image_token, num_patches, video_token
            )
            text = [t.replace("<video>", video_repl.full, 1) for t in text]
        return text, video_inputs
    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process text (with `<image>`/`<video>` placeholders), images,
        and videos into one BatchFeature."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        videos = self._make_batch_input(videos)
        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        text, video_inputs = self._preprocess_video(
            text=text,
            videos=videos,
            dynamic_image_size=dynamic_image_size,
        )
        text_inputs = self.tokenizer(text)
        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
        return BatchFeature(combined_outputs, tensor_type=return_tensors)
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the <img>...</img> replacement text for one image tag."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END
        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Return the per-frame replacement text for one video tag.

        Note: the per-frame token count comes from `self.num_image_token`;
        the `feature_size` argument is not used in the expansion below.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")
        repl_features = video_context_token * self.num_image_token
        repl_features_with_sep = IMG_START + repl_features + IMG_END
        # num_patches is equal to num_frames
        repl_full = "".join(
            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
        )
        return PromptUpdateDetails.select_text(repl_full, video_context_token)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
from typing import TypeVar
import numpy as np
import torch
from PIL import Image
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torchvision import transforms as T
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.tokenizers import TokenizerLike
# Generic type variable used by _make_batch_input.
_T = TypeVar("_T")
# Default final canvas size; index 0 is height, index 1 is width
# (see NemotronParseImageProcessor._create_transforms).
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class NemotronParseImageProcessor:
"""
NemotronParse Image Processor
"""
def __init__(
self,
final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
**kwargs,
):
# Ensure final_size is properly formatted
if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
self.final_size = (int(final_size[0]), int(final_size[1]))
elif isinstance(final_size, (int, float)):
self.final_size = (int(final_size), int(final_size))
else:
self.final_size = DEFAULT_FINAL_IMAGE_SIZE # Default fallback
self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
# Create transforms
self._create_transforms()
def _create_transforms(self):
"""Create transform objects."""
try:
import albumentations as A
except ImportError as err:
raise ImportError(
"The package `albumentations` is required to use "
"NemotronParse model. Please install it with `pip install "
"albumentations`."
) from err
# Ensure final_size is a tuple of integers
if isinstance(self.final_size, (list, tuple)):
self.target_height, self.target_width = (
int(self.final_size[0]),
int(self.final_size[1]),
)
else:
self.target_height = self.target_width = int(self.final_size)
import cv2
self.transform = A.Compose(
[
A.PadIfNeeded(
min_height=self.target_height,
min_width=self.target_width,
border_mode=cv2.BORDER_CONSTANT,
fill=[255, 255, 255],
p=1.0,
),
]
)
self.torch_transform = T.Compose(
[
T.ToTensor(),
]
)
def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
"""Resize image maintaining aspect ratio (exact replica of original
LongestMaxSizeHW)."""
height, width = image.shape[:2]
max_size_height = self.target_height
max_size_width = self.target_width
# Original LongestMaxSizeHW algorithm from custom_augmentations.py
aspect_ratio = width / height
new_height = height
new_width = width
# If height too big then scale image down
if height > max_size_height:
new_height = max_size_height
new_width = int(new_height * aspect_ratio)
# If width too big, scale image down further
if new_width > max_size_width:
new_width = max_size_width
new_height = int(new_width / aspect_ratio)
# Use cv2.INTER_LINEAR like the original
import cv2
return cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
)
def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
"""Pad image to target size with white padding (matches A.PadIfNeeded
behavior)."""
h, w = image.shape[:2]
min_height, min_width = self.target_height, self.target_width
# Only pad if image is smaller than target (matches A.PadIfNeeded logic)
pad_h = max(0, min_height - h)
pad_w = max(0, min_width - w)
if pad_h == 0 and pad_w == 0:
return image
# A.PadIfNeeded pads to bottom-right with constant value
if len(image.shape) == 3:
# Color image - pad bottom and right with white (255, 255, 255)
padded = np.pad(
image,
((0, pad_h), (0, pad_w), (0, 0)),
mode="constant",
constant_values=255,
)
else:
# Grayscale image - pad with white (255)
padded = np.pad(
image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
)
return padded
def preprocess(
self,
images: Image.Image | list[Image.Image],
**kwargs,
) -> dict[str, torch.Tensor]:
"""
Preprocess an image or batch of images for the NemotronParse model.
Args:
images: Input image(s)
"""
# Ensure images is a list
if not isinstance(images, list):
images = [images]
# Convert PIL images to numpy arrays if needed
processed_images = []
for image in images:
if isinstance(image, Image.Image):
image = np.asarray(image)
processed_images.append(image)
# Apply NemotronParse-specific transforms
pixel_values = []
for image in processed_images:
# Manual resize with aspect ratio preservation
# (replaces LongestMaxSizeHW)
processed_image = self._resize_with_aspect_ratio(image)
# Apply remaining albumentations transforms if available
if self.transform is not None:
transformed = self.transform(image=processed_image)
processed_image = transformed["image"]
else:
# Fallback: just pad to target size
processed_image = self._pad_to_size(processed_image)
# Convert to tensor
pixel_values_tensor = self.torch_transform(processed_image)
# Handle grayscale images
if pixel_values_tensor.shape[0] == 1:
pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
pixel_values.append(pixel_values_tensor)
# Stack into batch
pixel_values = torch.stack(pixel_values)
# Normalize pixel values
normalized_values = (pixel_values - self.norm_mean) / self.norm_std
return {"pixel_values": normalized_values}
def __call__(
    self, images: Image.Image | list[Image.Image], **kwargs
) -> dict[str, torch.Tensor]:
    # Convenience alias: calling the processor directly runs preprocess().
    return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
    """
    NemotronParse Processor

    Pairs the tokenizer with a :class:`NemotronParseImageProcessor` sized
    from the model config, and merges their outputs into one BatchFeature.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        # Normalize None / single item / list into a list.
        if isinstance(input_item, list):
            return input_item
        if input_item is None:
            return []
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        texts = self._make_batch_input(text)
        imgs = self._make_batch_input(images)

        # Image features only when at least one image was supplied.
        image_inputs = self.image_processor(imgs) if imgs else {}
        text_inputs = self.tokenizer(texts, add_special_tokens=False, **kwargs)

        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

View File

@@ -0,0 +1,410 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import InternVLProcessor
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
# NOTE(review): disabling the limit also removes PIL's decompression-bomb
# protection for untrusted inputs — confirm inputs are size-validated upstream.
Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
def build_transform(input_size: int):
    """Build the tile transform: RGB conversion, bicubic square resize, to-tensor.

    Note: no normalization is applied here; callers that need it compose it
    on top (see ``build_siglip_transform``).
    """
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
            ),
            T.ToTensor(),
        ]
    )
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Select the candidate tiling whose score is highest.

    The score combines area coverage (capped at 0.6) with how closely the
    tiling's aspect ratio matches the image's. Returns ``(1, 1)`` when no
    candidates are given.
    """
    area = width * height

    def _score(candidate: tuple[int, int]) -> float:
        rw, rh = candidate
        candidate_ar = rw / rh
        # Coverage: fraction of the image area the tiled grid represents.
        coverage = min((rw * rh * image_size * image_size) / area, 0.6)
        # Closeness: symmetric ratio of aspect ratios, in (0, 1].
        closeness = min(candidate_ar / aspect_ratio, aspect_ratio / candidate_ar)
        return coverage * closeness

    best = (1, 1)
    best_score = float("-inf")
    for candidate in target_ratios:
        score = _score(candidate)
        # Strict comparison: the first candidate with the top score wins.
        if score > best_score:
            best_score, best = score, candidate
    return best
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose a tiling grid and derive patch count plus resize target.

    Returns:
        ``(num_blocks, target_width, target_height)``; ``num_blocks`` includes
        an extra thumbnail tile when requested and the grid has more than one
        tile.
    """
    grid_w, grid_h = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    blocks = grid_w * grid_h
    # The global thumbnail tile only exists alongside a multi-tile grid.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_w, image_size * grid_h
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into a grid of square tiles, plus an optional thumbnail."""
    orig_width, orig_height = image.size

    # Tile count and resize target WITHOUT the thumbnail; appended below.
    blocks, target_width, target_height = calculate_nemotron_vl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))
    cols = target_width // image_size

    tiles: list[Image.Image] = []
    for idx in range(blocks):
        # Walk the grid row-major; each crop is an image_size square.
        row, col = divmod(idx, cols)
        left, top = col * image_size, row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks

    # A downscaled copy of the whole image acts as the global thumbnail tile.
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate (cols, rows) tilings whose tile count lies in [min_num, max_num],
    sorted by total tile count."""
    ratios: set[tuple[int, int]] = set()
    for total in range(min_num, max_num + 1):
        for cols in range(1, total + 1):
            for rows in range(1, total + 1):
                if min_num <= cols * rows <= max_num:
                    ratios.add((cols, rows))
    return sorted(ratios, key=lambda wh: wh[0] * wh[1])
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile an image and stack the transformed tiles into one tensor."""
    # Default to the plain (unnormalized) transform when none is supplied.
    if transform is None:
        transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=get_nemotron_vl_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    return torch.stack([transform(tile) for tile in tiles])
class NemotronVLProcessor(InternVLProcessor):
    """InternVL-style processor specialized for Nemotron-VL checkpoints.

    Reuses the InternVL tiling/token-replacement machinery, but reads its
    geometry from ``config.force_image_size`` / ``config.patch_size`` and
    uses ``<image>`` itself as the per-image context token (which requires
    a temporary placeholder during replacement; see ``_replace_image_tokens``).
    """

    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Intentionally bypass InternVLProcessor.__init__: all attributes are
        # populated here from this model's differently-shaped config.
        ABC.__init__(self)

        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        image_size: int = config.force_image_size
        patch_size: int = config.patch_size
        if min_dynamic_patch is None:
            min_dynamic_patch = 1
        assert isinstance(min_dynamic_patch, int)
        if max_dynamic_patch is None:
            # NOTE(review): dereferences image_processor here, so callers
            # passing image_processor=None must supply max_dynamic_patch
            # explicitly (the embed subclass does).
            max_dynamic_patch = self.image_processor.max_num_tiles
        assert isinstance(max_dynamic_patch, int)
        if dynamic_image_size is None:
            dynamic_image_size = True
        assert isinstance(dynamic_image_size, bool)

        # Number of LLM context tokens produced per image tile.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size

        if image_processor is not None:
            self.use_thumbnail = image_processor.use_thumbnail
        else:
            self.use_thumbnail = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        # Vocabulary id of the image context token.
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        # Subclasses may override to add model-specific normalization.
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of context tokens an image of the given size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        # One stacked tile tensor per input image (dim 0 = tile count).
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_nemotron_vl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=self._get_transform(),
            )
            for image in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens."""
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            # Use temporary placeholder to avoid replacing tokens we just
            # inserted: the context token here is literally "<image>", so a
            # direct substitution would match its own output.
            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
        # Restore the real context token in one final pass.
        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand the matching prompt placeholders."""
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                # All tiles of all images concatenated along dim 0 ...
                "pixel_values_flat": torch.cat(pixel_values_lst),
                # ... plus the per-image tile counts needed to split them back.
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            text = self._replace_image_tokens(text, pixel_values_lst)

        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Full prompt replacement for one image: IMG_START + context tokens + IMG_END."""
        repl_features = self.IMG_CONTEXT * feature_size
        repl_full = self.IMG_START + repl_features + self.IMG_END
        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
# SigLIP normalization constants
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)


def build_siglip_transform(input_size: int):
    """Build transform for SigLIP vision encoder with normalization.

    Extends the base transform from nemotron_vl with SigLIP-specific
    normalization (mean 0.5 / std 0.5 maps [0, 1] pixels to [-1, 1]).
    """
    return T.Compose(
        [
            build_transform(input_size=input_size),
            T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
        ]
    )
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for LlamaNemotronVL embedding model.

    Inherits from NemotronVLProcessor and specializes it for embedding tasks:
    - Uses SigLIP transform with normalization instead of base transform
    - Uses different image context token (<IMG_CONTEXT> vs <image>)
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Unset options fall back to the processor config, then the model
        # config, then a hard default.
        def _resolved(value, cfg_key, attr, default):
            if value is not None:
                return value
            return processor_config.get(cfg_key, getattr(config, attr, default))

        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=_resolved(
                min_dynamic_patch, "min_input_tiles", "min_dynamic_patch", 1
            ),
            max_dynamic_patch=_resolved(
                max_dynamic_patch, "max_input_tiles", "max_dynamic_patch", 1
            ),
            dynamic_image_size=_resolved(
                dynamic_image_size, "dynamic_image_size", "dynamic_image_size", True
            ),
        )

    def _get_transform(self) -> T.Compose:
        """Override to add SigLIP normalization."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Expand each ``<image>`` placeholder directly, in order.

        No temporary placeholder is needed: the context token here is
        ``<IMG_CONTEXT>``, not ``<image>``, so an inserted replacement can
        never be matched by a later substitution.
        """
        for pixel_values in pixel_values_lst:
            patches = pixel_values.shape[0]
            repl = self.get_image_repl(patches * self.num_image_token, patches)
            text = [segment.replace("<image>", repl.full, 1) for segment in text]
        return text

View File

@@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
# --------------------------------------------------------
# NVLM-D
# Copyright (c) 2024 NVIDIA
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from vllm.multimodal.processing import PromptUpdateDetails
from .internvl import BaseInternVLProcessor
# Placeholder token that stands in for image features in NVLM-D prompts.
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-based processor for NVLM-D, which tags each tile explicitly."""

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the image pad token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the per-image prompt replacement.

        Each tile is introduced by a positional tag (``<tile_1>`` ..., and
        ``<tile_global_thumbnail>`` for the thumbnail when enabled) followed
        by a run of IMG_PAD tokens.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        tags = [f"<tile_{idx}>" for idx in range(1, num_patches)]
        if self.use_thumbnail:
            tags.append("<tile_global_thumbnail>")

        pad_run = IMG_PAD * (feature_size // num_patches)
        body = "".join(tag + pad_run for tag in tags)

        # Wrap with <Image>...</Image>: "<Image><tile" tokenizes as
        # ["<Image", "><", "tile"], so "<tile" alone cannot be located as a
        # subsequence — including the wrapper keeps token matching well-defined.
        return PromptUpdateDetails.select_text(f"<Image>{body}</Image>", IMG_PAD)

View File

@@ -0,0 +1,389 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
# Special tokens delimiting and filling the image region in SkyworkR1V prompts.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# Standard ImageNet channel statistics used for input normalization.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Build the tile transform: RGB conversion, bicubic square resize,
    to-tensor, then ImageNet normalization."""
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
            ),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD),
        ]
    )
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the candidate tiling whose aspect ratio is nearest the image's.

    Ties are broken in favor of the larger tiling once the image area exceeds
    half of the tiled area. Returns ``(1, 1)`` when no candidates are given.
    """
    area = width * height
    best = (1, 1)
    best_diff = float("inf")
    for rw, rh in target_ratios:
        diff = abs(aspect_ratio - rw / rh)
        if diff < best_diff:
            best_diff = diff
            best = (rw, rh)
        elif diff == best_diff and area > 0.5 * image_size * image_size * rw * rh:
            # Equal closeness: prefer this (larger) tiling for big images.
            best = (rw, rh)
    return best
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) tile counts.

    Static sizing forces both bounds to 1; the thumbnail adds one extra tile
    to the maximum whenever tiling is actually possible.
    """
    if not dynamic_image_size:
        min_dynamic_patch = max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1

    return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate (cols, rows) tilings whose tile count lies in [min_num, max_num],
    sorted by total tile count."""
    ratios: set[tuple[int, int]] = set()
    for total in range(min_num, max_num + 1):
        for cols in range(1, total + 1):
            for rows in range(1, total + 1):
                if min_num <= cols * rows <= max_num:
                    ratios.add((cols, rows))
    return sorted(ratios, key=lambda wh: wh[0] * wh[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose a tiling grid and derive patch count plus resize target.

    Returns:
        ``(num_blocks, target_width, target_height)``; ``num_blocks`` includes
        an extra thumbnail tile when requested and the grid has more than one
        tile.
    """
    grid_w, grid_h = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    blocks = grid_w * grid_h
    # The global thumbnail tile only exists alongside a multi-tile grid.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_w, image_size * grid_h
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into a grid of square tiles, plus an optional thumbnail."""
    orig_width, orig_height = image.size

    # Tile count and resize target WITHOUT the thumbnail; appended below.
    blocks, target_width, target_height = calculate_skyworkr1v_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))
    cols = target_width // image_size

    tiles: list[Image.Image] = []
    for idx in range(blocks):
        # Walk the grid row-major; each crop is an image_size square.
        row, col = divmod(idx, cols)
        left, top = col * image_size, row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks

    # A downscaled copy of the whole image acts as the global thumbnail tile.
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and stack the transformed tiles into one tensor."""
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=get_skyworkr1v_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )
    return torch.stack([transform(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        # Tiling geometry comes from the nested vision config.
        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        # Keyword overrides fall back to the model config.
        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)
        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)
        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Number of LLM context tokens produced per image tile.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        # Vocabulary id of the image context token.
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Full prompt replacement for one image: IMG_START + context tokens + IMG_END."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END
        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve effective (min, max) tile counts; ``None`` arguments fall
        back to the values stored on the processor."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate (cols, rows) tilings for the resolved min/max tile counts."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        return get_skyworkr1v_target_ratios(min_num, max_num)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of context tokens an image of the given size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        # One stacked tile tensor per input image (dim 0 = tile count).
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_skyworkr1v(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize text and preprocess images into a single BatchFeature."""
        # Normalize both inputs to lists.
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                # All tiles of all images concatenated along dim 0 ...
                "pixel_values_flat": torch.cat(pixel_values_lst),
                # ... plus the per-image tile counts needed to split them back.
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            # Expand each "<image>" placeholder, in order, to its full
            # token-sequence replacement.
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token
                image_repl = self.get_image_repl(feature_size, num_patches)
                text = [t.replace("<image>", image_repl.full, 1) for t in text]

        text_inputs = self.tokenizer(text)
        combined_outputs = {**text_inputs, **image_inputs}

        return BatchFeature(combined_outputs, tensor_type=return_tensors)