[VLM] Remove BaseProcessingInfo.get_mm_max_tokens_per_item (#16408)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-11 00:06:58 +08:00
parent 7678fcd5b6
commit 83b824c8b4
39 changed files with 104 additions and 677 deletions
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -408,13 +408,6 @@ class AriaProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
    def get_num_image_tokens(self) -> int:
        hf_config = self.get_hf_config()
        return max(hf_config.projector_patch_to_query_dict.values())
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -117,31 +117,6 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
    def get_image_processor(self) -> GotOcr2ImageProcessor:
        return self.get_hf_processor().image_processor

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
-    def get_max_image_tokens(self) -> int:
-        hf_processor = self.get_hf_processor()
-        image_processor = hf_processor.image_processor
-
-        image_size = self.get_image_size_with_most_features()
-        num_patches = self.get_num_patches(
-            image_width=image_size.width,
-            image_height=image_size.height,
-            size=image_processor.size,
-            min_patches=image_processor.min_patches,
-            max_patches=image_processor.max_patches,
-        )
-
-        img_patches_per_tile = (hf_processor.img_size //
-                                hf_processor.patch_size)**2
-
-        return num_patches * img_patches_per_tile
-
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -406,13 +406,6 @@ class Blip2ProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
    def get_num_image_tokens(self) -> int:
        hf_config = self.get_hf_config()
        return hf_config.num_query_tokens
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -64,13 +64,6 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
    def get_num_image_tokens(self) -> int:
        processor = self.get_hf_processor()
        return processor.image_seq_length
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -30,9 +30,6 @@ class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
    ) -> int:
        return self.get_patch_grid_length()**2 + 1

-    def get_max_image_tokens(self) -> int:
-        return self.get_patch_grid_length()**2 + 1
-
    def get_image_size(self) -> int:
        return self.vision_config.image_size

--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -168,20 +168,6 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
                                image_width=x[1], image_height=x[0]))
        return ImageSize(width=width, height=height)

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        num_images = mm_counts.get("image", 0)
-        max_image_size = self.get_image_size_with_most_features()
-        max_image_tokens = self.get_num_image_tokens(
-            image_height=max_image_size.height,
-            image_width=max_image_size.width,
-            cropping=num_images <= 2)
-
-        return {"image": max_image_tokens}
-

 class DeepseekVL2DummyInputsBuilder(
        BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -764,17 +764,10 @@ class Florence2ProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_max_image_tokens(self) -> int:
+    def get_num_image_tokens(self) -> int:
        processor_config = self.ctx.get_hf_image_processor_config()
        return processor_config["image_seq_length"]

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-

 class Florence2DummyInputsBuilder(
        BaseDummyInputsBuilder[Florence2ProcessingInfo]):
@@ -871,7 +864,7 @@ class Florence2MultiModalProcessor(
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        pad_token_id = hf_config.pad_token_id
-        num_image_tokens = self.info.get_max_image_tokens()
+        num_image_tokens = self.info.get_num_image_tokens()
        image_tokens = [pad_token_id] * num_image_tokens

        return [
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -80,13 +80,6 @@ class FuyuProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_image_feature_grid_size(
        self,
        *,
@@ -129,14 +122,6 @@ class FuyuProcessingInfo(BaseProcessingInfo):
        return ImageSize(width=image_processor.size["width"],
                         height=image_processor.size["height"])

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-        )
-

 class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):

--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -68,13 +68,6 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def _resolve_image_kwargs(
        self,
        processor: Gemma3Processor,
@@ -228,15 +221,6 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
        # Result in the max possible feature size (h:w = max_num_crops:1)
        return ImageSize(height=50 * max_num_crops, width=50)

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-

 class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):

--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -431,13 +431,6 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_feature_tokens()}
-
    def get_num_image_tokens(self) -> int:
        hf_config = self.get_hf_config()
        vision_config = hf_config.vision_config
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -412,19 +412,6 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
            **kwargs,
        )

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        max_tokens_one_image = self.get_max_image_tokens(use_msac=None)
-        if mm_counts.get("image", 0) <= 1:
-            max_tokens_per_image = max_tokens_one_image
-        else:
-            max_tokens_per_image = self.get_max_image_tokens(use_msac=False)
-
-        return {"image": max_tokens_per_image}
-
    def get_num_image_tokens(
        self,
        *,
@@ -442,16 +429,6 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
            use_msac=use_msac,
        )

-    def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-            use_msac=use_msac,
-        )
-

 class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
                               ):
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -97,13 +97,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def _resize_output_size(self,
                            *,
                            height: int,
@@ -287,15 +280,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
            height=image_processor.size["longest_edge"],
        )

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-

 class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
                                 ):
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -458,13 +458,6 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_num_image_tokens(
        self,
        *,
@@ -480,15 +473,6 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
            image_height=image_height,
        )

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-
    def get_image_size_with_most_features(self) -> ImageSize:
        processor = self.get_hf_processor()

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -137,13 +137,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def _apply_feature_select_strategy(
        self,
        strategy: str,
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -61,22 +61,6 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"video": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        max_video_tokens = self.get_num_video_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            num_frames=self.get_num_frames_with_most_features(
-                seq_len, mm_counts),
-        )
-
-        return {"video": max_video_tokens}
-
    def get_image_size_with_most_features(self) -> ImageSize:
        vision_encoder_info = self.get_vision_encoder_info()
        width = height = vision_encoder_info.get_image_size()
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -101,16 +101,6 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image": self.get_max_image_tokens(),
-            "video": self.get_max_video_tokens(seq_len, mm_counts),
-        }
-
    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
    # with additional logic afterwards taken from LlavaOnevisionProcessor
    def _get_num_unpadded_features(
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -142,17 +142,6 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {**super().get_supported_mm_limits(), "audio": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            **super().get_mm_max_tokens_per_item(seq_len, mm_counts),
-            "audio":
-            self.get_max_audio_tokens(),
-        }
-
    def get_audio_placeholder(
        self,
        audio_lens: int,
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -346,18 +346,6 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):

        return mm_limits

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        mm_max_tokens = {"image": self.get_max_image_tokens()}
-        if self.get_model_version() == (2, 6):
-            mm_max_tokens["video"] = self.get_max_video_tokens(
-                seq_len, mm_counts)
-
-        return mm_max_tokens
-
    def get_slice_image_placeholder(
        self,
        image_size: ImageSize,
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -162,13 +162,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_num_image_tokens(
        self,
        *,
@@ -186,14 +179,6 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
        width = height = vision_encoder_info.get_image_size()
        return ImageSize(width=width, height=height)

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-        )
-

 _I = TypeVar("_I", bound=BaseLlavaProcessingInfo)

--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -106,16 +106,6 @@ class MllamaProcessingInfo(BaseProcessingInfo):
        image_size = self.get_hf_config().vision_config.image_size
        return calc_token_per_chunk(image_size)

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        vision_config = self.get_hf_config().vision_config
-        token_per_chunk = self.get_token_per_chunk_from_config()
-        mm_max_tokens = vision_config.max_num_tiles * token_per_chunk
-        return {"image": mm_max_tokens}
-
    def get_num_tiles_per_image(self, image_height: int,
                                image_width: int) -> int:
        vision_config = self.get_hf_config().vision_config
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -498,17 +498,6 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
        image_processor = self.get_hf_processor().image_processor
        return image_processor.max_patches

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        vision_config = self.get_hf_config().vision_config
-        patch_per_chunk = self.get_patch_per_chunk(vision_config)
-        num_patches = self.get_max_num_tiles() + 1
-
-        return {"image": patch_per_chunk * num_patches}
-
    def get_image_size_with_most_features(self) -> ImageSize:
        vision_config = self.get_hf_config().vision_config
        image_size = vision_config.image_size
@@ -516,14 +505,6 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
        return ImageSize(height=self.get_max_num_tiles() * image_size,
                         width=image_size)

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-        )
-

 class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
                                 ):
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1164,13 +1164,6 @@ class MolmoProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_num_image_tokens(
        self,
        *,
@@ -1195,15 +1188,6 @@ class MolmoProcessingInfo(BaseProcessingInfo):

        return extra + joint

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-
    def get_image_size_with_most_features(self) -> ImageSize:
        processor = self.get_hf_processor()

--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -13,7 +13,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                    MultiModalInputs, MultiModalKwargs)
-from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptIndexTargets,
                                        PromptInsertion, PromptUpdate,
@@ -72,16 +73,18 @@ class PaliGemmaProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_mm_max_tokens_per_item(
+    def get_num_image_tokens(
        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
-    def get_num_image_tokens(self) -> int:
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
        vision_encoder_info = self.get_vision_encoder_info()
-        return vision_encoder_info.get_max_image_tokens()
+
+        return vision_encoder_info.get_num_image_tokens(
+            image_width=image_width,
+            image_height=image_height,
+        )


 class PaliGemmaDummyInputsBuilder(
@@ -148,12 +151,30 @@ class PaliGemmaMultiModalProcessor(
        image_token_id = hf_config.image_token_index

        tokenizer = self.info.get_tokenizer()
-        num_image_tokens = self.info.get_num_image_tokens()
-        image_tokens = [image_token_id] * num_image_tokens

        bos_token_id = tokenizer.bos_token_id
        assert isinstance(bos_token_id, int)

+        def get_insertion(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+
+            image_tokens = [image_token_id] * num_image_tokens
+
+            return PromptUpdateDetails.select_token_id(
+                image_tokens + [bos_token_id],
+                embed_token_id=image_token_id,
+            )
+
        # Paligemma 1 and 2 have different tokenizer.add_bos_token
        # Insert <image>*n + <bos> after <bos> for Paligemma 1
        # Insert <image>*n + <bos> for Paligemma 2
@@ -162,10 +183,7 @@ class PaliGemmaMultiModalProcessor(
                modality="image",
                target=PromptIndexTargets.prefix(
                    [bos_token_id] if tokenizer.add_bos_token else []),
-                insertion=PromptUpdateDetails.select_token_id(
-                    image_tokens + [bos_token_id],
-                    embed_token_id=image_token_id,
-                ),
+                insertion=get_insertion,
            )
        ]

--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -321,21 +321,6 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        max_image_tokens = self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-
-        return {"image": max_image_tokens}
-
    def get_num_image_tokens(
        self,
        *,
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -167,13 +167,6 @@ class PixtralProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_vision_config(
        self,
        processor: Optional[PixtralProcessorAdapter] = None,
@@ -207,14 +200,6 @@ class PixtralProcessingInfo(BaseProcessingInfo):

        return ImageSize(width=max_image_size, height=max_image_size)

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-        )
-

 class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):

@@ -938,14 +923,6 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
        )
        return ncols * nrows

-    def get_max_image_tokens(self) -> int:
-        image_size = self.get_image_size()
-
-        return self.get_num_image_tokens(
-            image_width=image_size,
-            image_height=image_size,
-        )
-
    def get_image_size(self) -> int:
        return self.vision_config.image_size

--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -45,9 +45,6 @@ class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": 0}
-

 class PrithviGeoSpatialMAEInputBuilder(
        BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -109,17 +109,6 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        hf_config = self.get_hf_config()
-        max_source_positions = hf_config.audio_config.max_source_positions
-        max_output_lengths = (max_source_positions - 2) // 2 + 1
-
-        return {"audio": max_output_lengths}
-

 class Qwen2AudioDummyInputsBuilder(
        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -818,16 +818,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image": self.get_max_image_tokens(),
-            "video": self.get_max_video_tokens(seq_len, mm_counts),
-        }
-
    def _get_vision_info(
        self,
        *,
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -530,13 +530,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
    def get_num_image_tokens(self) -> int:
        hf_config = self.get_hf_config()
        vision_config = hf_config.visual
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -33,9 +33,6 @@ class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
    ) -> int:
        return self.get_patch_grid_length()**2

-    def get_max_image_tokens(self) -> int:
-        return self.get_patch_grid_length()**2
-
    def get_image_size(self) -> int:
        return self.vision_config.image_size

--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -459,13 +459,6 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_num_image_tokens(
        self,
        *,
@@ -481,15 +474,6 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
            image_height=image_height,
        )

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-
    def get_image_size_with_most_features(self) -> ImageSize:
        processor = self.get_hf_processor()

--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -2,7 +2,6 @@

 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
-import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
 from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
@@ -107,17 +106,6 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        feature_extractor = self.get_feature_extractor()
-        max_audio_tokens = math.ceil(feature_extractor.chunk_length *
-                                     _AUDIO_TOKENS_PER_SECOND)
-
-        return {"audio": max_audio_tokens * _MAX_ENCODER_BATCH_SIZE}
-

 class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
                                 ):
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -33,10 +33,6 @@ class VisionEncoderInfo(ABC, Generic[_C]):
    ) -> int:
        raise NotImplementedError

-    @abstractmethod
-    def get_max_image_tokens(self) -> int:
-        raise NotImplementedError
-
    @abstractmethod
    def get_image_size(self) -> int:
        raise NotImplementedError
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -538,16 +538,9 @@ class WhisperProcessingInfo(BaseProcessingInfo):
        assert isinstance(feature_extractor, WhisperFeatureExtractor)
        return feature_extractor

-    def get_max_audio_tokens(self) -> int:
+    def get_num_audio_tokens(self) -> int:
        return self.get_hf_config().max_source_positions

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"audio": self.get_max_audio_tokens()}
-

 class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):

@@ -630,7 +623,7 @@ class WhisperMultiModalProcessor(
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
-        num_tokens = self.info.get_max_audio_tokens()
+        num_tokens = self.info.get_num_audio_tokens()
        return [
            PromptReplacement(
                modality="audio",