diff --git a/tests/models/registry.py b/tests/models/registry.py index e8b0f2191..895dc4579 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -7,6 +7,7 @@ from typing import Any, Literal import pytest from packaging.version import Version +from transformers import PretrainedConfig from transformers import __version__ as TRANSFORMERS_VERSION from vllm.config.model import ModelDType, TokenizerMode @@ -1004,7 +1005,26 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, ), "NemotronH_Nano_VL_V2": _HfExamplesInfo( - "nano_vl_dummy", is_available_online=False, trust_remote_code=True + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + max_model_len=4096, + # NemotronH layers are constructed via `hybrid_override_pattern`: + use_original_num_layers=True, + hf_overrides={ + "vision_config": PretrainedConfig( + args={ + "min_num_patches": 1, # Trigger image dynamic res + "max_num_patches": 12, + "model": "vit_huge_patch16_224", + }, + # Trigger conv3d: + video_temporal_patch_size=2, + ), + "text_config": { + "num_hidden_layers": 2, + "hybrid_override_pattern": "M*", + }, + }, + trust_remote_code=True, ), "OpenCUAForConditionalGeneration": _HfExamplesInfo( "xlangai/OpenCUA-7B", trust_remote_code=True diff --git a/tests/models/utils.py b/tests/models/utils.py index 4830f18dc..6d6636c96 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -447,9 +447,16 @@ def dummy_hf_overrides( Dummy HF overrides function used to create dummy model with only minimum nums of layer. """ - hf_config.update(exist_overrides or {}) + # Copy because this helper is called more than once + # while loading config, and we `.pop()` + exist_overrides = (exist_overrides or {}).copy() + text_config_override = exist_overrides.pop("text_config", None) + hf_config.update(exist_overrides) text_config = hf_config.get_text_config() + if text_config_override is not None: + # multimodal test models may override *some* text-model fields + text_config.update(text_config_override) # Ensure at least 2 expert per group # Since `grouped_topk` assumes top-2 diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 90425417a..819bb4a3c 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -7,7 +7,6 @@ # LICENSE is in root directory. # -------------------------------------------------------- -import copy import math import warnings from collections.abc import Iterable, Mapping, Sequence @@ -17,7 +16,7 @@ from typing import Annotated, Literal, TypeAlias import torch import torch.nn as nn -from transformers import BatchFeature +from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions @@ -210,11 +209,15 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo): @cached_property def is_dynamic_tiler(self) -> bool: - return self.get_hf_processor().dynamic_tiler is not None + return BaseNanoNemotronVLProcessor.use_dynamic_resolution(self.get_hf_config()) - @cached_property + @property def supports_video(self): - return self.get_hf_processor().supports_video + return True + + @property + def supports_audio(self) -> bool: + return self.sound_config is not None def get_video_token(self) -> str | None: return IMG_CONTEXT @@ -223,8 +226,8 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo): return self.ctx.get_mm_config().video_pruning_rate @property - def audio_extractor(self) -> ParakeetExtractor | None: - return self.get_hf_processor().audio_extractor + def sound_config(self) -> PretrainedConfig | None: + return getattr(self.get_hf_config(), "sound_config", None) def get_default_tok_params(self) -> TokenizeParams: return super().get_default_tok_params().with_kwargs(add_special_tokens=False) @@ -232,14 +235,14 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, int | None]: image_limit = {"image": None} video_limit = {"video": None} if self.supports_video else {} - audio_limit = {"audio": None} if self.audio_extractor is not None else {} + audio_limit = {"audio": None} if self.supports_audio else {} return {**image_limit, **video_limit, **audio_limit} def get_data_parser(self): target_sr = None target_channels = None - if extractor := self.audio_extractor: - target_sr = extractor.sampling_rate + if self.sound_config: + target_sr = self.sound_config.sampling_rate target_channels = 1 return MultiModalDataParser( @@ -371,7 +374,7 @@ class NanoNemotronVLMultiModalProcessor( fields = self._get_image_fields_config(hf_inputs) if self.info.supports_video: fields |= self._get_video_fields_config(hf_inputs) - if self.info.audio_extractor: + if self.info.supports_audio: fields |= self._get_audio_fields_config(hf_inputs) return fields @@ -399,9 +402,8 @@ class NanoNemotronVLMultiModalProcessor( if isinstance(images, ImageEmbeddingItems): feature_size = images.get_feature_size(item_idx) - elif tiler := hf_processor.dynamic_tiler: - image = images.get(item_idx) - feature_size = tiler.get_cached_feature_size(image) + elif self.info.is_dynamic_tiler: + feature_size = out_mm_data["num_tokens_per_image"][item_idx] else: image_size = images.get_image_size(item_idx) max_num_tiles = hf_processor.max_num_tiles @@ -536,7 +538,7 @@ class NanoNemotronVLMultiModalProcessor( prompt_repls.append( self._get_prompt_repl_video(mm_items, hf_processor, out_mm_data) ) - if self.info.audio_extractor: + if self.info.supports_audio: prompt_repls.append( self._get_prompt_repl_audio(mm_items, hf_processor, out_mm_data) ) @@ -772,12 +774,14 @@ class NanoNemotronVLDummyInputsBuilder( else: dummy_video = {} - if extractor := self.info.audio_extractor: + if sound_config := self.info.sound_config: num_audios = mm_counts.get("audio", 0) audio_overrides = mm_options.get("audio") if mm_options else None tokens_per_audio = max(1, seq_len // max(num_audios, 1)) - max_audio_num_samples = MAX_AUDIO_LEN_S * extractor.sampling_rate - calculated_max_audio_num_samples = extractor.audio_length(tokens_per_audio) + max_audio_num_samples = MAX_AUDIO_LEN_S * sound_config.sampling_rate + calculated_max_audio_num_samples = ParakeetExtractor.audio_length( + sound_config, tokens_per_audio + ) audio_len = min(max_audio_num_samples, calculated_max_audio_num_samples) dummy_audio = { "audio": self._get_dummy_audios( @@ -1029,9 +1033,13 @@ class NemotronH_Nano_VL_V2( data=image_embeds, ) + pixel_values_flat = kwargs.pop("pixel_values_flat", None) + if pixel_values_flat is None: + return None + if self.dynamic_resolution: pixel_values_flat = DynamicResolutionImageTiler.stack( - kwargs.pop("pixel_values_flat"), self.patch_size + pixel_values_flat, self.patch_size ) return NanoNemotronVLImagePixelInputsDynamic( pixel_values_flat=pixel_values_flat, **kwargs @@ -1497,15 +1505,13 @@ class NemotronH_Nano_VL_V2( @classmethod def get_mamba_state_shape_from_config(cls, vllm_config: "VllmConfig"): text_config = vllm_config.model_config.hf_config.text_config - temp_vllm_config = copy.deepcopy(vllm_config) - temp_vllm_config.model_config.hf_config = text_config + temp_vllm_config = vllm_config.with_hf_config(text_config) return NemotronHForCausalLM.get_mamba_state_shape_from_config(temp_vllm_config) @classmethod def get_mamba_state_dtype_from_config(cls, vllm_config: "VllmConfig"): text_config = vllm_config.model_config.hf_config.text_config - temp_vllm_config = copy.deepcopy(vllm_config) - temp_vllm_config.model_config.hf_config = text_config + temp_vllm_config = vllm_config.with_hf_config(text_config) return NemotronHForCausalLM.get_mamba_state_dtype_from_config(temp_vllm_config) @classmethod diff --git a/vllm/model_executor/models/parakeet.py b/vllm/model_executor/models/parakeet.py index 1a3fd5bad..67c61dd80 100644 --- a/vllm/model_executor/models/parakeet.py +++ b/vllm/model_executor/models/parakeet.py @@ -159,5 +159,7 @@ class ParakeetExtractor(ParakeetFeatureExtractor): outputs["audio_num_clips"] = audio_num_clips return outputs - def audio_length(self, audio_tokens: int) -> int: - return int(audio_tokens * self.config.subsampling_factor * self.hop_length) + @staticmethod + def audio_length(raw_config: PretrainedConfig, audio_tokens: int) -> int: + config = ExtractorConfig.from_hf_config(raw_config) + return int(audio_tokens * config.subsampling_factor * config.hop_length) diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py index 9d1a070ca..7ec320c53 100644 --- a/vllm/model_executor/models/radio.py +++ b/vllm/model_executor/models/radio.py @@ -176,7 +176,6 @@ class ViTPatchGenerator(nn.Module): temporal_patch_size=temporal_patch_size, **factory, ) - self._video_embedder_loaded = False if abs_pos: scale = embed_dim**-0.5 @@ -225,12 +224,7 @@ class ViTPatchGenerator(nn.Module): Returns: Embedded patches with temporal compression applied. """ - if not self._video_embedder_loaded: - raise ValueError( - "Temporal compression (video_temporal_patch_size > 1) requires " - "video_embedder weights, but they were never loaded. " - "Ensure the checkpoint was trained with temporal compression." - ) + assert self.temporal_patch_size > 1 T = self.temporal_patch_size input_size = x.shape[2:] @@ -794,9 +788,6 @@ class RadioModel(nn.Module): weight_loader(param, weight) loaded_params.add(vllm_key) - if "model.patch_generator.video_embedder.weight" in loaded_params: - self.model.patch_generator._video_embedder_loaded = True - return loaded_params def _extract_final( diff --git a/vllm/transformers_utils/configs/parakeet.py b/vllm/transformers_utils/configs/parakeet.py index 7c7a5ddd8..8309277b0 100644 --- a/vllm/transformers_utils/configs/parakeet.py +++ b/vllm/transformers_utils/configs/parakeet.py @@ -44,15 +44,19 @@ class ExtractorConfig: subsampling_factor: int subsampling_conv_kernel_size: int subsampling_conv_stride: int + hop_length: int = 160 + """Default `160`: Matches HF default""" clip_duration_s: int = 30 clip_min_duration_s: float = 0.1 @staticmethod def from_hf_config(config: PretrainedConfig) -> "ExtractorConfig": assert isinstance(config, PretrainedConfig) + hop_length = int(getattr(config, "hop_length", ExtractorConfig.hop_length)) return ExtractorConfig( feature_size=config.num_mel_bins, sampling_rate=config.sampling_rate, + hop_length=hop_length, subsampling_factor=config.subsampling_factor, subsampling_conv_kernel_size=config.subsampling_conv_kernel_size, subsampling_conv_stride=config.subsampling_conv_stride, diff --git a/vllm/transformers_utils/processors/nano_nemotron_vl.py b/vllm/transformers_utils/processors/nano_nemotron_vl.py index 594290c14..42659c8c1 100644 --- a/vllm/transformers_utils/processors/nano_nemotron_vl.py +++ b/vllm/transformers_utils/processors/nano_nemotron_vl.py @@ -356,15 +356,6 @@ class DynamicResolutionImageTiler: feature_sizes.append(param.num_embeddings) return images, feature_sizes - feature_size_cache: dict[Image.Image, int] = {} - - @classmethod - def get_cached_feature_size(cls, image: Image.Image) -> int: - feature_size = cls.feature_size_cache[id(image)] - # hard assert that we only use the feature size once - del cls.feature_size_cache[id(image)] - return feature_size - @dataclass class DynamicResolutionParams: media: Image.Image @@ -519,7 +510,6 @@ class DynamicResolutionImageTiler: param, token_count = self.process_media(media, tokens_for_media) params.append(param) token_counts.append(token_count) - self.feature_size_cache[id(param.media)] = param.num_embeddings # Step 2: Check if total tokens is within budget total_tokens = sum(token_counts) @@ -857,13 +847,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): @property def supports_video(self) -> bool: - return self.video_token_id is not None + return True @property - def video_token_id(self) -> int | None: - if self.video_token is None: - return None - return self.tokenizer.get_vocab().get(self.video_token, None) + def video_token_id(self) -> int: + assert self.video_token is not None + return self.tokenizer.get_vocab()[self.video_token] @property def image_token_id(self) -> int: @@ -1055,6 +1044,13 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): text_inputs = self.tokenizer(text, add_special_tokens=False) combined_inputs = {**text_inputs, **video_inputs, **audio_inputs} + frames_indices = combined_inputs.get("frames_indices") + ragged_frames_indices = ( + isinstance(frames_indices, list) + and len({len(frame_indices) for frame_indices in frames_indices}) > 1 + ) + if ragged_frames_indices: + combined_inputs.pop("frames_indices") if self.dynamic_tiler is None: batch = BatchFeature( @@ -1066,6 +1062,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): # allow images to be exempt from the BatchFeature validation: # We will .stack() them in _parse_and_validate_image_input batch.update(image_inputs) + if ragged_frames_indices: + assert isinstance(frames_indices, list) + batch["frames_indices"] = [ + torch.as_tensor(frame_indices, dtype=torch.int64) + for frame_indices in frames_indices + ] return batch def get_image_repl(