[Model] Allow passing custom number of max tiles to Nano 2 VL (#26403)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
commit f9582fd8f4 · parent f377333bd7 · committed by GitHub
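For context, a minimal usage sketch of what this change enables: the tile budget can now be configured when constructing the engine instead of relying on the hard-coded 12. The model id and tile count below are placeholders, and the sketch assumes the new kwarg is forwarded through vLLM's standard mm_processor_kwargs plumbing, as the hunks below indicate.

from vllm import LLM

# Placeholder model id; applies to any Nano 2 VL checkpoint served by this processor.
llm = LLM(
    model="nvidia/Nemotron-Nano-VL",           # placeholder
    mm_processor_kwargs={"max_num_tiles": 8},  # overrides DEFAULT_NUM_TILES (12)
)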
@@ -93,6 +93,7 @@ IMG_CONTEXT = "<image>"

 # Profiling
 MAX_FRAMES = 16
+DEFAULT_NUM_TILES = 12


 class NanoNemotronVLImagePixelInputs(TypedDict):
@@ -227,6 +228,8 @@ def video_to_pixel_values(
     max_num_tiles: int = 1,
     use_thumbnail: bool,
 ) -> torch.Tensor:
+    assert max_num_tiles == 1, "Video modality always uses one tile"
+
     # Convert each frame to a single resized tile tensor consistent
     # with image path
     frames_tensors: list[torch.Tensor] = []
@@ -255,13 +258,19 @@ class BaseNanoNemotronVLProcessor(ABC):
     """

     def __init__(
-        self, config: PretrainedConfig, tokenizer: AnyTokenizer, *args, **kwargs
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *args,
+        max_num_tiles: Optional[int] = None,
+        **kwargs,
     ) -> None:
         super().__init__()

         self.config = config
         self.tokenizer = tokenizer

+        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
         image_size: int = config.force_image_size
         patch_size: int = config.patch_size

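The new resolution order, restated as a small standalone sketch (the helper name is illustrative, not part of the diff): a per-call max_num_tiles wins, otherwise the value passed to the processor's __init__, otherwise the module-level DEFAULT_NUM_TILES.

from typing import Optional

DEFAULT_NUM_TILES = 12  # module-level default introduced in this commit

def resolve_max_num_tiles(
    init_value: Optional[int],          # value given to the processor's __init__
    call_value: Optional[int] = None,   # value given per __call__ / via kwargs
) -> int:
    # Mirrors `self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES`
    # followed by `if max_num_tiles is None: max_num_tiles = self.max_num_tiles`.
    configured = init_value or DEFAULT_NUM_TILES
    return call_value if call_value is not None else configured

assert resolve_max_num_tiles(None) == 12             # nothing set -> default
assert resolve_max_num_tiles(6) == 6                 # processor-level setting
assert resolve_max_num_tiles(6, call_value=4) == 4   # per-call override wins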
@@ -361,7 +370,7 @@ class BaseNanoNemotronVLProcessor(ABC):
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles

         text, images = [self._make_batch_input(x) for x in (text, images)]

@@ -390,6 +399,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
+        max_num_tiles: Optional[int] = None,
         min_dynamic_patch: Optional[int] = None,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
@@ -399,6 +409,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         super().__init__(
             config=config,
             tokenizer=tokenizer,
+            max_num_tiles=max_num_tiles,
             min_dynamic_patch=min_dynamic_patch,
             max_dynamic_patch=max_dynamic_patch,
             dynamic_image_size=dynamic_image_size,
@@ -506,7 +517,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
-            max_num_tiles = 12
+            max_num_tiles = self.max_num_tiles

         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
@@ -521,7 +532,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
            text, video_inputs = self._preprocess_video(
                text=text,
                videos=videos,
-               max_num_tiles=max_num_tiles,
+               max_num_tiles=1,
                dynamic_image_size=dynamic_image_size,
            )

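Note that the video path above deliberately pins max_num_tiles=1 when calling _preprocess_video, regardless of the configured value; this is the invariant enforced by the assertion added to video_to_pixel_values earlier in this diff.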
@@ -635,7 +646,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
     def get_max_image_tokens(self) -> int:
         processor = self.get_hf_processor()
         # Use default max_num_tiles for max tokens calculation
-        max_num_tiles = 12
+        max_num_tiles = processor.max_num_tiles
         target_width, target_height = self.get_image_size_with_most_features(
             max_num_tiles
         )
@@ -768,7 +779,9 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
        else:
            image_size = images.get_image_size(item_idx)
            # Extract max_num_tiles from kwargs, default to 12
-           max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12)
+           max_num_tiles = hf_processor_mm_kwargs.get(
+               "max_num_tiles", hf_processor.max_num_tiles
+           )
            feature_size = self.info.get_num_image_tokens(
                image_width=image_size.width,
                image_height=image_size.height,
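Finally, a hedged request-level example, continuing the engine sketch near the top of this page (llm and a PIL image are assumed to exist; the prompt format is a placeholder). It assumes vLLM accepts per-request mm_processor_kwargs for this model, which is the source that hf_processor_mm_kwargs.get(...) reads from in the hunk above.

# Continues the earlier sketch; `llm` and `image` are assumed to be defined.
outputs = llm.generate({
    "prompt": "<image>\nDescribe the chart.",      # placeholder prompt format
    "multi_modal_data": {"image": image},
    "mm_processor_kwargs": {"max_num_tiles": 4},   # per-request override
})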