nano-nemotron-vl: get_mm_max_tokens_per_item for audio, video, image == seq_len (#38727)

Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-04-07 10:23:29 +03:00
parent 5c35517a3e
commit a9a0e0551f
1 changed files with 52 additions and 10 deletions
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -288,6 +288,35 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
            max_num_tiles=max_num_tiles,
        )

+    def get_dummy_image_size_and_max_tokens(
+        self, mm_counts: Mapping[str, int]
+    ) -> tuple[tuple[int, int], int]:
+        processor = self.get_hf_processor()
+        num_images = mm_counts.get("image", 0)
+
+        if tiler := processor.dynamic_tiler:
+            budget = tiler.max_num_tokens_available(text_prompt_length=num_images)
+            target_width, target_height = (
+                tiler.width_and_height_for_max_num_tokens_available(budget)
+            )
+            return (
+                (target_width, target_height),
+                tiler._get_num_embeddings(target_width, target_height),
+            )
+
+        max_num_tiles = processor.max_num_tiles
+        target_width, target_height = self.get_image_size_with_most_features(
+            max_num_tiles
+        )
+        return (
+            (target_width, target_height),
+            processor.get_num_image_tokens(
+                image_width=target_width,
+                image_height=target_height,
+                max_num_tiles=max_num_tiles,
+            ),
+        )
+
    def get_num_frames_with_most_features(
        self,
        seq_len: int,
@@ -306,6 +335,26 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
        max_frames_per_video = max_tubelets_per_video * T
        return max(max_frames_per_video, 1)

+    def get_mm_max_tokens_per_item(
+        self, seq_len: int, mm_counts: Mapping[str, int]
+    ) -> Mapping[str, int]:
+        mm_max_tokens: dict[str, int] = {}
+
+        if mm_counts.get("image", 0) > 0:
+            _, mm_max_tokens["image"] = self.get_dummy_image_size_and_max_tokens(
+                mm_counts
+            )
+
+        if mm_counts.get("video", 0) > 0:
+            assert self.supports_video
+            mm_max_tokens["video"] = seq_len
+
+        if mm_counts.get("audio", 0) > 0:
+            assert self.supports_audio
+            mm_max_tokens["audio"] = seq_len
+
+        return mm_max_tokens
+

 class NanoNemotronVLMultiModalProcessor(
    BaseMultiModalProcessor[NanoNemotronVLProcessingInfo]
@@ -708,17 +757,10 @@ class NanoNemotronVLDummyInputsBuilder(
        mm_options: Mapping[str, BaseDummyOptions],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)
+        (target_width, target_height), _ = (
+            self.info.get_dummy_image_size_and_max_tokens(mm_counts)
+        )
        processor = self.info.get_hf_processor()
-        if tiler := processor.dynamic_tiler:
-            budget = tiler.max_num_tokens_available(text_prompt_length=num_images)
-            target_width, target_height = (
-                tiler.width_and_height_for_max_num_tokens_available(budget)
-            )
-        else:
-            max_num_tiles = 12
-            target_width, target_height = self.info.get_image_size_with_most_features(
-                max_num_tiles
-            )

        image_overrides = mm_options.get("image")