nano-nemotron-vl: get_mm_max_tokens_per_item for audio, video, image == seq_len (#38727)

Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
This commit is contained in:
Netanel Haber
2026-04-07 10:23:29 +03:00
committed by GitHub
parent 5c35517a3e
commit a9a0e0551f

View File

@@ -288,6 +288,35 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
max_num_tiles=max_num_tiles,
)
def get_dummy_image_size_and_max_tokens(
self, mm_counts: Mapping[str, int]
) -> tuple[tuple[int, int], int]:
processor = self.get_hf_processor()
num_images = mm_counts.get("image", 0)
if tiler := processor.dynamic_tiler:
budget = tiler.max_num_tokens_available(text_prompt_length=num_images)
target_width, target_height = (
tiler.width_and_height_for_max_num_tokens_available(budget)
)
return (
(target_width, target_height),
tiler._get_num_embeddings(target_width, target_height),
)
max_num_tiles = processor.max_num_tiles
target_width, target_height = self.get_image_size_with_most_features(
max_num_tiles
)
return (
(target_width, target_height),
processor.get_num_image_tokens(
image_width=target_width,
image_height=target_height,
max_num_tiles=max_num_tiles,
),
)
def get_num_frames_with_most_features(
self,
seq_len: int,
@@ -306,6 +335,26 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
max_frames_per_video = max_tubelets_per_video * T
return max(max_frames_per_video, 1)
def get_mm_max_tokens_per_item(
self, seq_len: int, mm_counts: Mapping[str, int]
) -> Mapping[str, int]:
mm_max_tokens: dict[str, int] = {}
if mm_counts.get("image", 0) > 0:
_, mm_max_tokens["image"] = self.get_dummy_image_size_and_max_tokens(
mm_counts
)
if mm_counts.get("video", 0) > 0:
assert self.supports_video
mm_max_tokens["video"] = seq_len
if mm_counts.get("audio", 0) > 0:
assert self.supports_audio
mm_max_tokens["audio"] = seq_len
return mm_max_tokens
class NanoNemotronVLMultiModalProcessor(
BaseMultiModalProcessor[NanoNemotronVLProcessingInfo]
@@ -708,17 +757,10 @@ class NanoNemotronVLDummyInputsBuilder(
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
(target_width, target_height), _ = (
self.info.get_dummy_image_size_and_max_tokens(mm_counts)
)
processor = self.info.get_hf_processor()
if tiler := processor.dynamic_tiler:
budget = tiler.max_num_tokens_available(text_prompt_length=num_images)
target_width, target_height = (
tiler.width_and_height_for_max_num_tokens_available(budget)
)
else:
max_num_tiles = 12
target_width, target_height = self.info.get_image_size_with_most_features(
max_num_tiles
)
image_overrides = mm_options.get("image")