diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py
index f12dc1865..29051b4a0 100644
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -4,8 +4,6 @@ from typing import NamedTuple
 
 import pytest
 import torch
-from packaging.version import Version
-from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
@@ -46,31 +44,13 @@ class MRoPETestInfo(NamedTuple):
     marks: list[pytest.MarkDecorator] = []
 
 
-TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version
-
 MODELS_TO_TEST = [
     MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"),
     MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"),
     MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"),
     MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"),
-    MRoPETestInfo(
-        model_name="Qwen/Qwen3-VL-4B-Instruct",
-        marks=[
-            pytest.mark.skipif(
-                Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
-                reason="Qwen3-VL only available after Transformers v4.57",
-            )
-        ],
-    ),
-    MRoPETestInfo(
-        model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
-        marks=[
-            pytest.mark.skipif(
-                Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
-                reason="Qwen3-VL only available after Transformers v4.57",
-            )
-        ],
-    ),
+    MRoPETestInfo(model_name="Qwen/Qwen3-VL-4B-Instruct"),
+    MRoPETestInfo(model_name="Qwen/Qwen3-VL-30B-A3B-Instruct"),
 ]
 
 num_tokens_list = [11, 8192]
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index d9b7a2821..2db9c531d 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -961,12 +961,6 @@ VLM_TEST_SETTINGS = {
                 limit_mm_per_prompt={"image": 4},
             )
         ],
-        marks=[
-            pytest.mark.skipif(
-                Version(TRANSFORMERS_VERSION) == Version("4.57.1"),
-                reason="This model is broken in Transformers v4.57.1",
-            )
-        ],
     ),
     # regression test for https://github.com/vllm-project/vllm/issues/15122
     "qwen2_5_vl-windows-attention": VLMTestInfo(
diff --git a/tests/models/multimodal/processing/test_gemma3.py b/tests/models/multimodal/processing/test_gemma3.py
index 5a3271e07..a9c259c89 100644
--- a/tests/models/multimodal/processing/test_gemma3.py
+++ b/tests/models/multimodal/processing/test_gemma3.py
@@ -168,6 +168,7 @@ def test_get_image_size_with_most_features(
         image_width=max_image_size.width,
         image_height=max_image_size.height,
         processor=hf_processor,
+        mm_kwargs=hf_processor_mm_kwargs,
     )
 
     prompt = "<start_of_image>"
diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
index d88d37f0b..342075ccc 100644
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -3,7 +3,9 @@
 """Tests for Idefics3's multimodal preprocessing kwargs."""
 
 import pytest
+from packaging.version import Version
 from transformers import Idefics3Config
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
@@ -11,6 +13,10 @@ from ....conftest import ImageTestAssets
 from ...utils import build_model_context
 
 
+@pytest.mark.skipif(
+    Version(TRANSFORMERS_VERSION) < Version("5.2.0"),
+    reason="See https://github.com/huggingface/transformers/pull/43948",
+)
 @pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"])
 @pytest.mark.parametrize(
     ("mm_processor_kwargs", "expected_toks_per_img"),
@@ -63,7 +69,11 @@ def test_processor_override(
 
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
-    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
+    hf_processed_inputs = hf_processor(
+        text=prompt,
+        images=mm_data["image"],
+        **processor.info.ctx.get_merged_mm_kwargs(hf_processor_mm_kwargs),
+    )
     assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
 
     # Ensure we have the right number of placeholders per num_crops size
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index a0ecce5d8..11f9ac232 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -82,6 +82,7 @@ def test_get_image_size_with_most_features(
         image_width=max_image_size.width,
         image_height=max_image_size.height,
         image_processor=hf_processor.image_processor,
+        mm_kwargs=hf_processor_mm_kwargs,
     )
 
     prompt = "<|vision_start|><|image_pad|><|vision_end|>"
diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py
index 102563154..e8ae56efd 100644
--- a/tests/models/multimodal/processing/test_smolvlm.py
+++ b/tests/models/multimodal/processing/test_smolvlm.py
@@ -3,7 +3,9 @@
 """Tests for smolvlm's multimodal preprocessing kwargs."""
 
 import pytest
+from packaging.version import Version
 from transformers import SmolVLMConfig
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
@@ -11,6 +13,10 @@ from ....conftest import ImageTestAssets
 from ...utils import build_model_context
 
 
+@pytest.mark.skipif(
+    Version(TRANSFORMERS_VERSION) < Version("5.2.0"),
+    reason="See https://github.com/huggingface/transformers/pull/43948",
+)
 @pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
 @pytest.mark.parametrize(
     ("mm_processor_kwargs", "expected_toks_per_img"),
@@ -63,7 +69,11 @@ def test_processor_override(
 
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
-    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
+    hf_processed_inputs = hf_processor(
+        text=prompt,
+        images=mm_data["image"],
+        **processor.info.ctx.get_merged_mm_kwargs(hf_processor_mm_kwargs),
+    )
     assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
 
     # Ensure we have the right number of placeholders per num_crops size
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index 4aefd2ead..1bcdd41b3 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -11,7 +11,7 @@ from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 from transformers.models.cohere2_vision import Cohere2VisionConfig
 from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import (  # noqa: E501
-    get_optimal_tiled_canvas,
+    Cohere2VisionImageProcessorFast,
 )
 from transformers.models.cohere2_vision.processing_cohere2_vision import (
     Cohere2VisionProcessor,
@@ -166,43 +166,20 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Cohere2VisionProcessor | None,
+        processor: Cohere2VisionProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         """
         Calculate the number of image patches for a given image.
         Uses the HF processor to determine the actual number of patches.
         """
-        if processor is None:
-            processor = self.get_hf_processor()
+        image_processor: Cohere2VisionImageProcessorFast = processor.image_processor
 
-        image_processor = processor.image_processor
-
-        # The current implementation of get_number_of_image_patches
-        # is incorrect, so we patch it here.
-        # TODO: Revert once
-        # https://github.com/huggingface/transformers/pull/40312 is released.
-        # return image_processor.get_number_of_image_patches(image_height,
-        #                                                    image_width, {})
-
-        min_patches = image_processor.min_patches
-        max_patches = image_processor.max_patches
-        patch_size = image_processor.size
-        crop_to_patches = image_processor.crop_to_patches
-
-        if not crop_to_patches:
-            return 1
-
-        num_columns, num_rows = get_optimal_tiled_canvas(
-            (image_height, image_width),
-            (patch_size["height"], patch_size["width"]),
-            min_patches,
-            max_patches,
+        return image_processor.get_number_of_image_patches(
+            image_height,
+            image_width,
+            self.ctx.get_merged_mm_kwargs(mm_kwargs),
         )
-        num_patches = num_columns * num_rows
-        if num_patches > 1:
-            num_patches += 1  # Thumbnail image
-
-        return num_patches
 
 
 class Cohere2VisionDummyInputsBuilder(
@@ -271,6 +248,7 @@ class Cohere2VisionMultiModalProcessor(
                     image_width=parsed_images.get_image_size(i).width,
                     image_height=parsed_images.get_image_size(i).height,
                     processor=hf_processor,
+                    mm_kwargs=mm_kwargs,
                 )
                 for i in range(len(parsed_images))
             ]
@@ -311,6 +289,7 @@ class Cohere2VisionMultiModalProcessor(
                 image_width=image_size.width,
                 image_height=image_size.height,
                 processor=hf_processor,
+                mm_kwargs=hf_processor_mm_kwargs,
             )
             patch_tokens = image_token * img_tokens_per_tile + img_line_break_token
             repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}"
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 50d3954b6..37e95b261 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -34,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from transformers import BatchFeature
+from transformers import BaseImageProcessor, BatchFeature
 
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
@@ -818,10 +818,9 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: Any | None,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> tuple[ImageSize, int]:
-        if image_processor is None:
-            image_processor = self.get_image_processor()
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
 
@@ -829,13 +828,16 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         spatial_conv_size = hf_config.spatial_conv_size
         temporal_conv_size = hf_config.temporal_conv_size
 
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * spatial_conv_size,
-                min_pixels=image_processor.min_pixels,
-                max_pixels=image_processor.max_pixels,
+                min_pixels=size["min_pixels"],
+                max_pixels=size["max_pixels"],
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
@@ -855,12 +857,14 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: Any | None,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_image_tokens
 
@@ -870,35 +874,43 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Any | None,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             num_frames=num_frames,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_video_tokens
 
     def get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self.get_image_processor()
+
         max_image_size, _ = self._get_vision_info(
             image_width=9999999,
             image_height=9999999,
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
         return max_image_size
 
     def get_max_image_tokens(self) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_image_tokens = self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
         return num_image_tokens
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
@@ -909,7 +921,8 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
-                image_processor=None,
+                image_processor=image_processor,
+                mm_kwargs={},
             )
 
             if next_max_tokens > max_tokens:
@@ -942,13 +955,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
             num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
 
 
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 1e803f89b..d0a326ccd 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -7,6 +7,7 @@ from typing import Annotated, Any, Literal
 import torch
 from torch import nn
 from transformers import BatchFeature, Gemma3Config, Gemma3Processor
+from transformers.models.gemma3.image_processing_gemma3 import Gemma3ImageProcessor
 from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
 
 from vllm.config import VllmConfig
@@ -84,54 +85,35 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
-    def _resolve_image_kwargs(
-        self,
-        processor: Gemma3Processor,
-        keys: set[str],
-    ) -> dict[str, Any]:
-        image_processor = processor.image_processor
-        kwargs = processor._merge_kwargs(
-            Gemma3ProcessorKwargs,
-            tokenizer_init_kwargs=processor.tokenizer.init_kwargs,
-        )
-
-        images_kwargs = kwargs["images_kwargs"]
-
-        def _resolve_kw(key: str):
-            val = getattr(image_processor, key)
-            if val is None:
-                val = images_kwargs[key]
-
-            return val
-
-        return {k: _resolve_kw(k) for k in keys}
-
     def get_num_crops(
         self,
         *,
         image_width: int,
         image_height: int,
-        processor: Gemma3Processor | None,
+        processor: Gemma3Processor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
+        image_processor: Gemma3ImageProcessor = processor.image_processor
 
-        images_kwargs = self._resolve_image_kwargs(
-            processor,
-            {
-                "do_pan_and_scan",
-                "pan_and_scan_min_crop_size",
-                "pan_and_scan_max_num_crops",
-                "pan_and_scan_min_ratio_to_activate",
-            },
+        images_kwargs = processor._merge_kwargs(
+            Gemma3ProcessorKwargs,
+            tokenizer_init_kwargs=processor.tokenizer.init_kwargs,
+            **self.ctx.get_merged_mm_kwargs(mm_kwargs),
+        )["images_kwargs"]
+
+        do_pan_and_scan = images_kwargs.get(
+            "do_pan_and_scan", image_processor.do_pan_and_scan
+        )
+        pan_and_scan_min_crop_size = images_kwargs.get(
+            "pan_and_scan_min_crop_size", image_processor.pan_and_scan_min_crop_size
+        )
+        pan_and_scan_max_num_crops = images_kwargs.get(
+            "pan_and_scan_max_num_crops", image_processor.pan_and_scan_max_num_crops
+        )
+        pan_and_scan_min_ratio_to_activate = images_kwargs.get(
+            "pan_and_scan_min_ratio_to_activate",
+            image_processor.pan_and_scan_min_ratio_to_activate,
         )
-
-        do_pan_and_scan = images_kwargs["do_pan_and_scan"]
-        pan_and_scan_min_crop_size = images_kwargs["pan_and_scan_min_crop_size"]
-        pan_and_scan_max_num_crops = images_kwargs["pan_and_scan_max_num_crops"]
-        pan_and_scan_min_ratio_to_activate = images_kwargs[
-            "pan_and_scan_min_ratio_to_activate"
-        ]
 
         if not do_pan_and_scan:
             return 0
@@ -180,17 +162,16 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Gemma3Processor | None,
+        processor: Gemma3Processor,
+        mm_kwargs: Mapping[str, object],
     ) -> PromptUpdateDetails[str]:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         boi_token = processor.boi_token
 
         num_crops = self.get_num_crops(
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
 
         if num_crops == 0:
@@ -215,15 +196,14 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Gemma3Processor | None,
+        processor: Gemma3Processor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         num_crops = self.get_num_crops(
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
         image_seq_len = processor.image_seq_length
 
@@ -231,11 +211,17 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(self) -> ImageSize:
         processor = self.get_hf_processor()
+        image_processor: Gemma3ImageProcessor = processor.image_processor
 
-        images_kwargs = self._resolve_image_kwargs(
-            processor, {"pan_and_scan_max_num_crops"}
+        images_kwargs = processor._merge_kwargs(
+            Gemma3ProcessorKwargs,
+            tokenizer_init_kwargs=processor.tokenizer.init_kwargs,
+            **self.ctx.get_merged_mm_kwargs({}),
+        )["images_kwargs"]
+
+        max_num_crops = images_kwargs.get(
+            "pan_and_scan_max_num_crops", image_processor.pan_and_scan_max_num_crops
         )
-        max_num_crops = images_kwargs["pan_and_scan_max_num_crops"]
 
         vision_config = self.get_hf_config().vision_config
         native_size = vision_config.image_size
@@ -303,6 +289,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
                     image_width=size.width,
                     image_height=size.height,
                     processor=hf_processor,
+                    mm_kwargs=mm_kwargs,
                 )
                 for size in image_sizes
             ]
@@ -339,6 +326,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
                 image_width=image_size.width,
                 image_height=image_size.height,
                 processor=hf_processor,
+                mm_kwargs=hf_processor_mm_kwargs,
             )
 
         return [
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 8588e51f5..3e4745f7c 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -131,7 +131,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Gemma3nProcessor | None,
+        processor: Gemma3nProcessor,
     ) -> str:
         """
         Get the replacement text for image tokens.
@@ -139,9 +139,6 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
         For Gemma3n, this should return the full_image_sequence which includes
         BOI token, repeated image tokens, and EOI token.
         """
-        if processor is None:
-            processor = self.get_hf_processor()
-
         return PromptUpdateDetails.select_token_id(
             processor.full_image_sequence, processor.image_token_id
         )
@@ -149,7 +146,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
     def get_audio_repl(
         self,
         *,
-        processor: Gemma3nProcessor | None,
+        processor: Gemma3nProcessor,
     ) -> str:
         """
         Get the replacement text for audio tokens.
@@ -157,9 +154,6 @@ class Gemma3nProcessingInfo(BaseProcessingInfo):
         For Gemma3n, this should return the full_audio_sequence which includes
         BOA token, repeated audio tokens, and EOA token.
         """
-        if processor is None:
-            processor = self.get_hf_processor()
-
         # Return the full audio sequence as defined by the processor
         return PromptUpdateDetails.select_token_id(
             processor.full_audio_sequence, processor.audio_token_id
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 90b495e0d..ea25f884f 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -424,12 +424,9 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: H2OVLProcessor | None,
+        processor: H2OVLProcessor,
         use_msac: bool | None = None,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         return processor.get_num_image_tokens(
             image_width=image_width,
             image_height=image_height,
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index edd00c5cd..50b6bd427 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -78,7 +78,10 @@ from vllm.transformers_utils.configs.hunyuan_vl import (
     HunYuanVLVisionConfig,
 )
 from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
-from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize
+from vllm.transformers_utils.processors.hunyuan_vl_image import (
+    HunYuanVLImageProcessor,
+    smart_resize,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (
@@ -596,7 +599,7 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo):
     def get_image_processor(
         self,
         **kwargs: object,
-    ) -> HunYuanVLProcessor:
+    ) -> HunYuanVLImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
     def get_data_parser(self):
@@ -624,23 +627,24 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: HunYuanVLProcessor | None,
+        image_processor: HunYuanVLImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> tuple[ImageSize, int]:
-        if image_processor is None:
-            image_processor = self.get_image_processor()
-
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         spatial_merge_size = vision_config.spatial_merge_size
 
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * spatial_merge_size,
-                min_pixels=image_processor.min_pixels,
-                max_pixels=image_processor.max_pixels,
+                min_pixels=size["shortest_edge"],
+                max_pixels=size["longest_edge"],
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
@@ -662,29 +666,37 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: HunYuanVLProcessor | None,
+        image_processor: HunYuanVLImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_image_tokens
 
     def get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self.get_image_processor()
+
         max_image_size, _ = self._get_vision_info(
             image_width=512,
             image_height=8192,
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
         return max_image_size
 
     def get_max_image_tokens(self) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
+
         return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
 
 
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index e2cfd1d63..434bc7318 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -16,7 +16,6 @@
 # limitations under the License.
 """Inference-only Idefics3 model compatible with HuggingFace weights."""
 
-import math
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Literal, TypeAlias
 
@@ -168,54 +167,35 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Idefics3Processor | None,
-    ) -> tuple[int, int]:
-        if processor is None:
-            processor = self.get_hf_processor()
-
+        processor: Idefics3Processor,
+        mm_kwargs: Mapping[str, object],
+    ) -> tuple[int, int, int]:
         image_processor: Idefics3ImageProcessor = processor.image_processor
 
-        max_image_size = image_processor.max_image_size["longest_edge"]
-        size = image_processor.size["longest_edge"]
-        assert size % max_image_size == 0, (
-            "`longest_edge` in image_processor's `size` must be divisible by "
-            "`longest_edge` in `max_image_size`, this may be caused by "
-            "incorrect mm_kwargs override."
+        return image_processor.get_number_of_image_patches(
+            image_height,
+            image_width,
+            self.ctx.get_merged_mm_kwargs(mm_kwargs),
         )
 
-        resized_height, resized_width = self._get_resize_output_image_size(
-            image_width=image_width,
-            image_height=image_height,
-            resolution_max_side=size,
-        )
-        if resized_height > max_image_size or resized_width > max_image_size:
-            grid_h = math.ceil(resized_height / max_image_size)
-            grid_w = math.ceil(resized_width / max_image_size)
-        else:
-            grid_h = grid_w = 0
-        return grid_w, grid_h
-
     def get_num_patches(
         self,
         *,
         image_width: int,
         image_height: int,
-        processor: Idefics3Processor | None,
+        processor: Idefics3Processor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
-        grid_w, grid_h = self._get_image_feature_grid_size(
+        num_patches, _, _ = self._get_image_feature_grid_size(
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
 
-        return grid_w * grid_h + 1
-
-    def _get_image_token(
-        self, processor: Idefics3Processor | None
-    ) -> tuple[str, str, str]:
-        if processor is None:
-            processor = self.get_hf_processor()
+        return num_patches
 
+    def _get_image_token(self, processor: Idefics3Processor) -> tuple[str, str, str]:
         image_token = processor.image_token
         fake_image_token = processor.fake_image_token
         global_image_token = processor.global_image_tag
@@ -226,11 +206,9 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Idefics3Processor | None,
+        processor: Idefics3Processor,
+        mm_kwargs: Mapping[str, object],
     ) -> str:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         image_token, fake_image_token, global_img_token = self._get_image_token(
             processor
         )
@@ -241,10 +219,11 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         global_img_placeholder = fake_image_token + global_img_token + p_img
         tile_img_placeholder = fake_image_token + grid_placeholder + p_img
 
-        grid_w, grid_h = self._get_image_feature_grid_size(
+        _, grid_h, grid_w = self._get_image_feature_grid_size(
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
         if grid_w == 0 and grid_h == 0:
             return global_img_placeholder + fake_image_token
@@ -272,15 +251,14 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Idefics3Processor | None,
+        processor: Idefics3Processor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         num_patches = self.get_num_patches(
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
 
         return num_patches * processor.image_seq_len
@@ -353,6 +331,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo
                 image_width=size.width,
                 image_height=size.height,
                 processor=hf_processor,
+                mm_kwargs=mm_kwargs,
             )
             for size in image_sizes
         ]
@@ -398,6 +377,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo
                 image_width=image_size.width,
                 image_height=image_size.height,
                 processor=hf_processor,
+                mm_kwargs=hf_processor_mm_kwargs,
             )
 
             return PromptUpdateDetails.select_text(
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index dd1332dfd..5e973aa83 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -197,20 +197,18 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: GotOcr2ImageProcessorFast | None = None,
+        processor: InternVLProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor().image_processor
+        image_processor: GotOcr2ImageProcessorFast = processor.image_processor
 
-        if not isinstance(processor, GotOcr2ImageProcessorFast):
-            raise ValueError(
-                f"GotOcr2ImageProcessorFast is expected but got {type(processor)}"
-            )
-        num_image_patches = processor.get_number_of_image_patches(
-            image_height, image_width, images_kwargs=dict()
+        num_image_patches = image_processor.get_number_of_image_patches(
+            image_height,
+            image_width,
+            self.ctx.get_merged_mm_kwargs(mm_kwargs),
         )
-        num_image_tokens = self.get_hf_processor().image_seq_length * num_image_patches
-        return num_image_tokens
+
+        return processor.image_seq_length * num_image_patches
 
     def resolve_target_ratios(self, use_thumbnail: bool | None = None):
         image_processor = self.get_hf_processor().image_processor
@@ -243,7 +241,8 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
             feat_size = self.get_num_image_tokens(
                 image_width=width,
                 image_height=height,
-                processor=processor.image_processor,
+                processor=processor,
+                mm_kwargs={},
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
@@ -262,7 +261,8 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
-            processor=processor.image_processor,
+            processor=processor,
+            mm_kwargs={},
         )
 
     def get_num_frames_with_most_features(
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 334ee3cbe..7fbbb7237 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -705,11 +705,8 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: BaseInternVLProcessor | None,
+        processor: BaseInternVLProcessor,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         return processor.get_num_image_tokens(
             image_width=image_width,
             image_height=image_height,
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 960915af6..2ae044c28 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -10,7 +10,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 from einops import rearrange
-from transformers import PretrainedConfig
+from transformers import BaseImageProcessor, PretrainedConfig
 from transformers.activations import GELUActivation
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
@@ -1011,24 +1011,25 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> tuple[ImageSize, int]:
-        if image_processor is None:
-            image_processor = self.get_image_processor()
-
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = 1
 
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * merge_size,
-                min_pixels=image_processor.min_pixels,
-                max_pixels=image_processor.max_pixels,
+                min_pixels=size["min_pixels"],
+                max_pixels=size["max_pixels"],
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
@@ -1050,12 +1051,14 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_image_tokens
 
@@ -1065,36 +1068,42 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             num_frames=num_frames,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_video_tokens
 
-    def get_image_size_with_most_features(
-        self,
-    ) -> ImageSize:
+    def get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self.get_image_processor()
+
         max_image_size, _ = self._get_vision_info(
             image_width=self.get_max_image_size(),
             image_height=self.get_max_image_size(),
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
         return max_image_size
 
     def get_max_image_tokens(self) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
@@ -1105,7 +1114,8 @@ class KeyeProcessingInfo(BaseProcessingInfo):
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
-                image_processor=None,
+                image_processor=image_processor,
+                mm_kwargs={},
             )
 
             if next_max_tokens > max_tokens:
@@ -1130,13 +1140,15 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         return max(max_frames_per_video, 1)
 
     def get_max_video_tokens(self, seq_len: int) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
             num_frames=self.get_num_frames_with_most_features(seq_len),
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
 
 
diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py
index b77b93196..98fd0b1b0 100644
--- a/vllm/model_executor/models/lfm2_vl.py
+++ b/vllm/model_executor/models/lfm2_vl.py
@@ -176,7 +176,7 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
         min_tiles: int,
         max_tiles: int,
         tile_size: int,
-    ) -> tuple[int, int]:
+    ) -> tuple[int, int, int]:
         aspect_ratio = width / height
         target_ratios = self._target_ratios(min_tiles, max_tiles)
         # find best matching grid configuration
@@ -190,18 +190,27 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
         self,
         image_width: int,
         image_height: int,
-        processor: Lfm2VlProcessor | None,
-    ) -> tuple[int, int]:
-        if processor is None:
-            processor = self.get_image_processor()
+        processor: Lfm2VlProcessor,
+        mm_kwargs: Mapping[str, object],
+    ) -> tuple[int, int, int]:
+        image_processor: Lfm2VlImageProcessorFast = processor.image_processor
 
-        downsample_factor = processor.image_processor.downsample_factor
-        encoder_patch_size = processor.image_processor.encoder_patch_size
-        max_pixels_tolerance = processor.image_processor.max_pixels_tolerance
-        min_tiles = processor.image_processor.min_tiles
-        max_tiles = processor.image_processor.max_tiles
-        max_image_tokens = processor.image_processor.max_image_tokens
-        tile_size = processor.image_processor.tile_size
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        downsample_factor = mm_kwargs.get(
+            "downsample_factor", image_processor.downsample_factor
+        )
+        encoder_patch_size = mm_kwargs.get(
+            "encoder_patch_size", image_processor.encoder_patch_size
+        )
+        max_pixels_tolerance = mm_kwargs.get(
+            "max_pixels_tolerance", image_processor.max_pixels_tolerance
+        )
+        min_tiles = mm_kwargs.get("min_tiles", image_processor.min_tiles)
+        max_tiles = mm_kwargs.get("max_tiles", image_processor.max_tiles)
+        max_image_tokens = mm_kwargs.get(
+            "max_image_tokens", image_processor.max_image_tokens
+        )
+        tile_size = mm_kwargs.get("tile_size", image_processor.tile_size)
 
         do_image_splitting = not min_tiles == max_tiles == 1
         is_image_large = self._is_image_too_large(
@@ -235,12 +244,14 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Lfm2VlProcessor | None,
+        processor: Lfm2VlProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, _, total_patches = self._get_image_feature_grid_size(
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
         return total_patches
 
@@ -249,11 +260,9 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         spatial_shapes: torch.Tensor,
-        processor: Lfm2VlProcessor | None,
+        processor: Lfm2VlProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> str:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         grid_placeholder = "<|img_row_{n_h}_col_{n_w}|>"
         image_token = processor.image_token
         image_start_token = processor.image_start_token
@@ -263,6 +272,7 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
         num_thumbnail_tokens, num_tokens_per_tile = self.get_num_image_tokens(
             spatial_shapes=spatial_shapes,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
         tile_img_placeholder = grid_placeholder + (image_token * num_tokens_per_tile)
 
@@ -270,6 +280,7 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
             image_width=image_width,
             image_height=image_height,
             processor=processor,
+            mm_kwargs=mm_kwargs,
         )
 
         if grid_w > 1 or grid_h > 1:
@@ -295,15 +306,25 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
         self,
         *,
         spatial_shapes: torch.Tensor,
-        processor: Lfm2VlProcessor | None,
+        processor: Lfm2VlProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> tuple[int, int]:
-        tile_size = processor.image_processor.tile_size
-        downsample_factor = processor.image_processor.downsample_factor
-        encoder_patch_size = processor.image_processor.encoder_patch_size
+        image_processor: Lfm2VlImageProcessorFast = processor.image_processor
+
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        downsample_factor = mm_kwargs.get(
+            "downsample_factor", image_processor.downsample_factor
+        )
+        encoder_patch_size = mm_kwargs.get(
+            "encoder_patch_size", image_processor.encoder_patch_size
+        )
+        tile_size = mm_kwargs.get("tile_size", image_processor.tile_size)
+
         num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2)
         num_patches_tile = tile_size // encoder_patch_size
         dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
         num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile
+
         return num_thumbnail_tokens, num_tiles_tokens
 
 
@@ -372,6 +393,7 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
                 image_width=size.width,
                 image_height=size.height,
                 processor=hf_processor,
+                mm_kwargs=mm_kwargs,
             )
             for size in image_sizes
         ]
@@ -414,6 +436,7 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]):
                 image_height=image_size.height,
                 spatial_shapes=spatial_shapes,
                 processor=hf_processor,
+                mm_kwargs=hf_processor_mm_kwargs,
             )
             return PromptUpdateDetails.select_text(
                 image_repl,
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 6edec9719..b3689ed19 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1224,11 +1224,8 @@ class MolmoProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: MolmoProcessorWrapper | None,
+        processor: MolmoProcessorWrapper,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         ncols, nrows = processor.get_patches_grid_size(
             image_width=image_width,
             image_height=image_height,
diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py
index e0f74ce46..d32c034b5 100644
--- a/vllm/model_executor/models/molmo2.py
+++ b/vllm/model_executor/models/molmo2.py
@@ -1869,12 +1869,9 @@ class Molmo2ProcessingInfo(BaseProcessingInfo):
         *,
         image_height: int,
         image_width: int,
-        processor: Molmo2ProcessorWrapper | None = None,
+        processor: Molmo2ProcessorWrapper,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
-        hf_processor = processor.processor  # type: ignore
+        hf_processor = processor.processor
 
         resize_nrows, resize_cols = processor.get_base_grid_size(is_video=False)
         # start/end tokens + image patch token + col tokens
@@ -1897,11 +1894,8 @@ class Molmo2ProcessingInfo(BaseProcessingInfo):
         self,
         *,
         num_frames: int,
-        processor: Molmo2ProcessorWrapper | None = None,
+        processor: Molmo2ProcessorWrapper,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         resize_nrows, resize_cols = processor.get_base_grid_size(is_video=True)
         # start/end tokens
         extra = 2 + resize_nrows * (
@@ -1929,7 +1923,9 @@ class Molmo2ProcessingInfo(BaseProcessingInfo):
             width = wr * crop_window_size + total_margin_pixels
 
             feat_size = self.get_num_image_tokens(
-                image_height=height, image_width=width, processor=processor
+                image_height=height,
+                image_width=width,
+                processor=processor,
             )
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
@@ -1940,8 +1936,15 @@ class Molmo2ProcessingInfo(BaseProcessingInfo):
 
         return largest_feature_pinpoint
 
-    def _get_max_video_frames(self, max_tokens: int) -> int:
-        num_tokens_per_frame = self.get_num_video_tokens(num_frames=1)
+    def _get_max_video_frames(
+        self,
+        max_tokens: int,
+        processor: Molmo2ProcessorWrapper,
+    ) -> int:
+        num_tokens_per_frame = self.get_num_video_tokens(
+            num_frames=1,
+            processor=processor,
+        )
         max_frames = max_tokens // num_tokens_per_frame
         return max(max_frames, 1)
 
@@ -1950,10 +1953,11 @@ class Molmo2ProcessingInfo(BaseProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
-        video_processor = self.get_hf_processor().processor.video_processor
+        processor = self.get_hf_processor()
+        video_processor = processor.processor.video_processor
         num_frames = video_processor.num_frames
         max_videos = mm_counts.get("video", 0)
-        max_total_frames = self._get_max_video_frames(seq_len)
+        max_total_frames = self._get_max_video_frames(seq_len, processor)
         max_frames_per_video = min(
             max_total_frames // max(max_videos, 1),
             num_frames,
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 69c0600d8..8d038d4ad 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -215,7 +215,7 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int = 1,
-    ) -> tuple[ImageSize, int]:
+    ) -> int:
         hf_config = self.get_hf_config()
         vit_config = hf_config.vit_config
         patch_size = vit_config.patch_size
@@ -245,7 +245,6 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
-                image_processor=None,
             )
             if next_max_tokens > max_tokens:
                 break
@@ -270,7 +269,6 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: BaseImageProcessor | None,
     ) -> int:
         num_video_tokens = self.get_num_image_tokens(
             image_width=image_width, image_height=image_height, num_frames=num_frames
@@ -287,7 +285,6 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
             image_width=target_width,
             image_height=target_height,
             num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
-            image_processor=None,
         )
 
 
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 8d287e342..021f24e11 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -23,7 +23,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 from einops import rearrange
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
 from transformers.activations import GELUActivation
 from transformers.modeling_outputs import (
     BaseModelOutputWithPooling,
@@ -147,21 +147,23 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor,
+        image_processor: BaseImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
-        if image_processor is None:
-            image_processor = self.get_image_processor()
-
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
+
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
         resized_height, resized_width = smart_resize(
             height=image_height,
             width=image_width,
             factor=patch_size * merge_size,
-            min_pixels=image_processor.min_pixels,
-            max_pixels=image_processor.max_pixels,
+            min_pixels=size["min_pixels"],
+            max_pixels=size["max_pixels"],
         )
         preprocessed_size = ImageSize(width=resized_width, height=resized_height)
 
@@ -176,12 +178,13 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(self) -> ImageSize:
         hf_config = self.get_hf_config()
+        image_processor = self.get_image_processor()
 
         # See `smart_resize` for the calculation of the image size.
         merge_size = hf_config.vision_config.spatial_merge_size
         patch_size = hf_config.vision_config.patch_size
         factor = merge_size * patch_size
-        max_num_tokens = self.get_image_processor().max_pixels // (factor**2)
+        max_num_tokens = image_processor.max_pixels // (factor**2)
         # Find factors of max_num_tokens close to its square root
         # to create a dummy image with a reasonable aspect ratio.
         h_patches = int(math.sqrt(max_num_tokens))
@@ -276,6 +279,7 @@ class PaddleOCRVLMultiModalProcessor(
                 image_width=image_size.width,
                 image_height=image_size.height,
                 image_processor=image_processor,
+                mm_kwargs=hf_processor_mm_kwargs,
             )
 
             return [image_token_id] * num_image_tokens
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 8f33cc859..a5a346e72 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -351,11 +351,8 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: ProcessorMixin | None = None,
+        processor: ProcessorMixin,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         return processor.calc_num_image_tokens_from_image_size(  # type: ignore
             width=image_width,
             height=image_height,
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index d11483a6b..89676a9a7 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -558,10 +558,8 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_dynamic_hd(
         self,
-        processor: ProcessorMixin | None = None,
+        processor: ProcessorMixin,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
         image_processor = processor.image_processor
         return image_processor.dynamic_hd
 
@@ -715,7 +713,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: ProcessorMixin | None = None,
+        processor: ProcessorMixin,
     ) -> int:
         hf_config = self.get_hf_config()
         vision_encoder_name = hf_config.img_processor
@@ -739,10 +737,9 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
         return image_num_tokens
 
-    def get_image_size_with_most_features(
-        self,
-        processor: ProcessorMixin | None = None,
-    ) -> ImageSize:
+    def get_image_size_with_most_features(self) -> ImageSize:
+        processor = self.get_hf_processor()
+
         hf_config = self.get_hf_config()
         vision_encoder_name = hf_config.img_processor
         if vision_encoder_name is None:
@@ -874,9 +871,12 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
             prompt, mm_data, mm_kwargs, tok_kwargs
         )
 
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
         num_img_tokens = [
             self.info.get_num_image_tokens(
-                image_width=img_size[0], image_height=img_size[1]
+                image_width=img_size[0],
+                image_height=img_size[1],
+                processor=hf_processor,
             )
             for img_size in processed_outputs["image_sizes"]
         ]
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 7d12cffcd..407cf3ff5 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -217,28 +217,13 @@ class PixtralProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
-    def get_vision_config(
-        self,
-        processor: PixtralProcessorAdapter | None = None,
-    ):
-        if processor is None:
-            processor = self.get_hf_processor()
-
-        return PixtralVisionConfig(
-            image_size=processor.image_size,
-            patch_size=processor.patch_size,
-        )
-
     def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
-        processor: PixtralProcessorAdapter | None = None,
+        processor: PixtralProcessorAdapter,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         ncols, nrows = processor.image_processor._image_to_num_tokens(
             Image.new("RGB", (image_width, image_height))
         )
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 62df900ad..1c568bdff 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -832,24 +832,25 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: Qwen2VLImageProcessor | None,
+        image_processor: Qwen2VLImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> tuple[ImageSize, int]:
-        if image_processor is None:
-            image_processor = self.get_image_processor()
-
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = vision_config.temporal_patch_size
 
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * merge_size,
-                min_pixels=image_processor.size["shortest_edge"],
-                max_pixels=image_processor.size["longest_edge"],
+                min_pixels=size["shortest_edge"],
+                max_pixels=size["longest_edge"],
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
@@ -873,13 +874,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: Qwen2VLImageProcessor | None,
+        image_processor: Qwen2VLImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             num_frames=1,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_image_tokens
 
@@ -889,13 +892,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Qwen2VLImageProcessor | None,
+        image_processor: Qwen2VLImageProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             num_frames=num_frames,
             image_processor=image_processor,
+            mm_kwargs=mm_kwargs,
         )
         return num_video_tokens
 
@@ -941,15 +946,18 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         return ImageSize(width=unit * width_factor, height=unit * height_factor)
 
     def get_max_image_tokens(self) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
 
     def _get_max_video_frames(self, max_tokens: int, start_num_frames: int = 1) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = start_num_frames
@@ -960,7 +968,8 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
-                image_processor=None,
+                image_processor=image_processor,
+                mm_kwargs={},
             )
 
             if next_max_tokens > max_tokens:
@@ -990,13 +999,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
+        image_processor = self.get_image_processor()
         target_width, target_height = self.get_image_size_with_most_features()
 
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
             num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
-            image_processor=None,
+            image_processor=image_processor,
+            mm_kwargs={},
         )
 
 
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 7d9785141..c18fc77f7 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -642,13 +642,9 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         image_height: int,
         num_frames: int = 2,
         do_resize: bool = True,
-        image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor | None,
+        image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor,
+        mm_kwargs: Mapping[str, object],
     ) -> tuple[ImageSize, int]:
-        if image_processor is None and num_frames > 1:
-            image_processor = self.get_video_processor()
-        elif image_processor is None:
-            image_processor = self.get_image_processor()
-
         is_video = isinstance(image_processor, Qwen3VLVideoProcessor)
 
         hf_config = self.get_hf_config()
@@ -657,6 +653,9 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = vision_config.temporal_patch_size
 
+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
         if do_resize:
             if is_video:
                 smart_resize = video_smart_resize
@@ -667,12 +666,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
             else:
                 smart_resize = image_smart_resize
                 extra_kwargs = {}
+
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * merge_size,
-                min_pixels=image_processor.size["shortest_edge"],
-                max_pixels=image_processor.size["longest_edge"],
+                min_pixels=size["shortest_edge"],
+                max_pixels=size["longest_edge"],
                 **extra_kwargs,
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
@@ -720,7 +720,8 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
             image_width=target_width,
             image_height=target_height,
             num_frames=2,
-            image_processor=None,
+            image_processor=video_processor,
+            mm_kwargs={},
         )
         return num_video_soft_tokens
 
@@ -846,6 +847,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
             image_height=target_video_height,
             num_frames=target_num_frames,
             image_processor=video_processor,
+            mm_kwargs={},
         )
         # NOTE: we need to do this check here since Qwen3-VL resizes video
         # frames depending on how many frames there are.
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 4fadad14d..acedb04bc 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -487,11 +487,8 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: SkyworkR1VProcessor | None,
+        processor: SkyworkR1VProcessor,
     ) -> int:
-        if processor is None:
-            processor = self.get_hf_processor()
-
         return processor.get_num_image_tokens(
             image_width=image_width,
             image_height=image_height,
diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py
index e8b805297..aef00ec59 100644
--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -16,9 +16,7 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
         return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs)
 
-    def _get_image_token(self, processor: SmolVLMProcessor | None) -> tuple[str, str]:
-        if processor is None:
-            processor = self.get_hf_processor()
+    def _get_image_token(self, processor: SmolVLMProcessor) -> tuple[str, str, str]:
         image_token = processor.image_token
         fake_image_token = processor.fake_image_token
         global_image_token = processor.global_image_token
diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py
index d5c14310c..34a186710 100644
--- a/vllm/multimodal/processing/context.py
+++ b/vllm/multimodal/processing/context.py
@@ -409,6 +409,10 @@ class InputProcessingContext:
 
         return json_map_leaves(_postprocess_one, output)
 
+    def get_merged_mm_kwargs(self, kwargs: Mapping[str, object]):
+        mm_config = self.model_config.get_multimodal_config()
+        return mm_config.merge_mm_processor_kwargs(kwargs)
+
     def call_hf_processor(
         self,
         hf_processor: ProcessorMixin,
@@ -424,8 +428,7 @@ class InputProcessingContext:
         """
         assert callable(hf_processor)
 
-        mm_config = self.model_config.get_multimodal_config()
-        merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
+        merged_kwargs = self.get_merged_mm_kwargs(kwargs)
 
         allowed_kwargs = get_allowed_kwarg_only_overrides(
             hf_processor,