[Model] Nemotron Parse 1.1 Support (#30864)

Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Author: amitz-nv
Date: 2026-01-05 23:00:14 +02:00
Committed by: GitHub
Parent: af1b07b0c5
Commit: ee21291825
13 changed files with 1117 additions and 31 deletions
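For orientation, a minimal offline-inference sketch of what this support enables. It is illustrative only and not part of the diff: the checkpoint name comes from the Hugging Face reference cited in the new model file below, and the explicit encoder/decoder prompt dict follows vLLM's usual format for encoder-decoder multimodal models; the exact decoder prompt this model expects may differ.

from PIL import Image
from vllm import LLM, SamplingParams

# Hypothetical usage; checkpoint name taken from the HF reference in nemotron_parse.py.
llm = LLM(model="nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True)
image = Image.open("document_page.png")

outputs = llm.generate(
    {
        # The image feeds the encoder; NemotronParseMultiModalProcessor builds the
        # encoder prompt itself (a placeholder expanded to the image token count).
        "encoder_prompt": {"prompt": "", "multi_modal_data": {"image": image}},
        # The decoder prompt seeds autoregressive generation of the parsed output.
        "decoder_prompt": "",
    },
    SamplingParams(temperature=0.0, max_tokens=512),
)
print(outputs[0].outputs[0].text)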


@@ -1220,7 +1220,7 @@ class NemotronH_Nano_VL_V2(
n = pixel_values.shape[0]
vit_embeds_list = []
for i in range(0, n, micro_batch_size):
vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
_, vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
h = w = int(vit_embeds.shape[1] ** 0.5)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
@@ -1695,12 +1695,7 @@ class NemotronH_Nano_VL_V2(
patch_size=patch_size,
norm_mean=hf_config.norm_mean,
norm_std=hf_config.norm_std,
reg_tokens=(
hf_config_vision.args.get("register_multiple")
if hasattr(hf_config_vision, "args")
and isinstance(hf_config_vision.args, dict)
else None
),
**hf_config_vision.args,
)
return RadioModel(config=radio_config)


@@ -0,0 +1,958 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
#
# Bart classes based on old vLLM codebase:
# https://github.com/vllm-project/vllm/blob/v0.10.2/vllm/model_executor/models/bart.py
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Literal
import cv2
import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from PIL import Image
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torchvision import transforms as T
from transformers import (
BartConfig,
BatchFeature,
PretrainedConfig,
TensorType,
)
from vllm.attention.backends.abstract import AttentionType
from vllm.config import CacheConfig, VllmConfig
from vllm.config.lora import LoRAConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead,
VocabParallelEmbedding,
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import (
MultiModalEmbeddings,
SupportsMultiModal,
)
from vllm.model_executor.models.radio import RadioModel
from vllm.model_executor.models.whisper import WhisperAttention, WhisperCrossAttention
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (
BaseProcessingInfo,
EncDecMultiModalProcessor,
PromptReplacement,
PromptUpdate,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils.tensor_schema import TensorSchema, TensorShape
logger = init_logger(__name__)
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class BartScaledWordEmbedding(VocabParallelEmbedding):
"""
This module overrides VocabParallelEmbedding's
forward by multiplying the output by the embedding scale.
"""
def __init__(
self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0
):
super().__init__(num_embeddings, embedding_dim)
self.embed_scale = embed_scale
def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
return super().forward(input_ids) * self.embed_scale
class BartParallelLMHead(ParallelLMHead):
"""
This module overrides ParallelLMHead's
forward by dividing the output by the embedding scale,
effectively the inverse of
BartScaledWordEmbedding.
"""
def __init__(
self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0
):
super().__init__(num_embeddings, embedding_dim)
self.embed_scale = embed_scale
def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
return super().forward(input_ids) / self.embed_scale
class BartDecoderLayer(nn.Module):
def __init__(
self,
config: BartConfig,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = WhisperAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
attn_type=AttentionType.DECODER,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
)
self.activation_fn = get_act_fn(config.activation_function)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
"""
afeldman-nm: personally I would call this "cross-attention",
however I left the name as "encoder_attn" to maintain consistency
with the name of the pretrained weights.
"""
self.encoder_attn = WhisperCrossAttention(
self.embed_dim,
config.decoder_attention_heads,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.encoder_attn",
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
ffn_hidden_size = self.embed_dim
ffn_intermediate_size = config.encoder_ffn_dim
ffn_has_bias = True
self.fc1 = ColumnParallelLinear(
ffn_hidden_size,
ffn_intermediate_size,
bias=ffn_has_bias,
quant_config=quant_config,
prefix=f"{prefix}.fc1",
)
self.fc2 = RowParallelLinear(
ffn_intermediate_size,
ffn_hidden_size,
bias=ffn_has_bias,
quant_config=quant_config,
prefix=f"{prefix}.fc2",
)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
decoder_hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor | None = None,
) -> torch.Tensor:
r"""
Args:
decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
Returns:
Decoder layer output torch.Tensor
"""
residual = decoder_hidden_states
# Self Attention
hidden_states = self.self_attn(hidden_states=decoder_hidden_states)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
# Cross-Attention Block
residual = hidden_states
hidden_states = self.encoder_attn(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
)
hidden_states = residual + hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# Fully Connected
residual = hidden_states
fc1_out, _ = self.fc1(hidden_states)
hidden_states = self.activation_fn(fc1_out)
hidden_states, _ = self.fc2(hidden_states)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
return hidden_states
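# MBartDecoderLayer reuses BartDecoderLayer's sub-modules but applies each LayerNorm
# before its sub-block (pre-norm) rather than after it (post-norm), following the
# MBart layer layout.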
class MBartDecoderLayer(BartDecoderLayer):
def forward(
self,
decoder_hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor | None = None,
) -> torch.Tensor:
residual = decoder_hidden_states
hidden_states = self.self_attn_layer_norm(decoder_hidden_states)
# Self Attention
hidden_states = self.self_attn(hidden_states=hidden_states)
hidden_states = residual + hidden_states
# Cross-Attention Block
residual = hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
hidden_states = self.encoder_attn(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
fc1_out, _ = self.fc1(hidden_states)
hidden_states = self.activation_fn(fc1_out)
hidden_states, _ = self.fc2(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class MBartDecoderNoPos(nn.Module):
"""
Transformer decoder without positional embeddings, consisting of
*config.decoder_layers* layers. Each layer is an [`MBartDecoderLayer`].
Args:
config: BartConfig
embed_tokens (nn.Embedding): optional embedding whose weights are shared with the decoder's token embedding
"""
def __init__(
self,
config: BartConfig,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
lora_config: LoRAConfig | None = None,
embed_tokens: nn.Embedding | None = None,
prefix: str = "",
):
super().__init__()
self.cache_config = cache_config
self.quant_config = quant_config
self.lora_config = lora_config
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, config.d_model, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.layers = nn.ModuleList(
[
MBartDecoderLayer(
config,
cache_config,
quant_config,
prefix=f"{prefix}.layers.{layer_idx}",
)
for layer_idx in range(config.decoder_layers)
]
)
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.layer_norm = nn.LayerNorm(config.d_model)
def forward(
self,
decoder_input_ids: torch.Tensor,
*,
encoder_hidden_states: torch.Tensor | None,
inputs_embeds: torch.Tensor | None = None,
**kwargs,
) -> torch.Tensor:
r"""
Args:
decoder_input_ids: Indices of *decoder* input sequence tokens in the
vocabulary. Padding will be ignored by default if you provide it.
encoder_hidden_states: Tensor of encoder output embeddings
Returns:
Decoder output torch.Tensor
"""
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(decoder_input_ids)
hidden_states = self.layernorm_embedding(inputs_embeds)
# decoder layers
for decoder_layer in self.layers:
hidden_states = decoder_layer(
decoder_hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
)
hidden_states = self.layer_norm(hidden_states)
return hidden_states
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
(".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
(".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
(".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
(".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"),
(".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"),
]
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
for name, loaded_weight in weights:
if name.startswith("embed_positions"):
continue
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
loaded_params.add(name)
return loaded_params
class NemotronParsePixelInputs(TensorSchema):
"""
Dimensions:
- b: Batch size
- 3: Number of channels (RGB)
- h: Height
- w: Width
"""
type: Literal["pixel_values"]
data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
class NemotronParseImageProcessor:
"""
NemotronParse Image Processor
"""
def __init__(
self,
final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
**kwargs,
):
# Ensure final_size is properly formatted
if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
self.final_size = (int(final_size[0]), int(final_size[1]))
elif isinstance(final_size, (int, float)):
self.final_size = (int(final_size), int(final_size))
else:
self.final_size = DEFAULT_FINAL_IMAGE_SIZE # Default fallback
self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
# Create transforms
self._create_transforms()
def _create_transforms(self):
"""Create transform objects."""
try:
import albumentations as A
except ImportError as err:
raise ImportError(
"The package `albumentations` is required to use "
"NemotronParse model. Please install it with `pip install "
"albumentations`."
) from err
# Ensure final_size is a tuple of integers
if isinstance(self.final_size, (list, tuple)):
self.target_height, self.target_width = (
int(self.final_size[0]),
int(self.final_size[1]),
)
else:
self.target_height = self.target_width = int(self.final_size)
self.transform = A.Compose(
[
A.PadIfNeeded(
min_height=self.target_height,
min_width=self.target_width,
border_mode=cv2.BORDER_CONSTANT,
fill=[255, 255, 255],
p=1.0,
),
]
)
self.torch_transform = T.Compose(
[
T.ToTensor(),
]
)
def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
"""Resize image maintaining aspect ratio (exact replica of original
LongestMaxSizeHW)."""
height, width = image.shape[:2]
max_size_height = self.target_height
max_size_width = self.target_width
# Original LongestMaxSizeHW algorithm from custom_augmentations.py
aspect_ratio = width / height
new_height = height
new_width = width
# If height too big then scale image down
if height > max_size_height:
new_height = max_size_height
new_width = int(new_height * aspect_ratio)
# If width too big, scale image down further
if new_width > max_size_width:
new_width = max_size_width
new_height = int(new_width / aspect_ratio)
# Use cv2.INTER_LINEAR like the original
return cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
)
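# Worked example: a hypothetical 3000x1500 (HxW) page with the default target of
# 2048x1648 gives aspect_ratio = 1500 / 3000 = 0.5; since 3000 > 2048, new_height
# becomes 2048 and new_width = int(2048 * 0.5) = 1024, which is within the 1648
# width limit. The padding transform applied in preprocess() then pads the result
# to 2048x1648 with white pixels.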
def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
"""Pad image to target size with white padding (matches A.PadIfNeeded
behavior)."""
h, w = image.shape[:2]
min_height, min_width = self.target_height, self.target_width
# Only pad if image is smaller than target (matches A.PadIfNeeded logic)
pad_h = max(0, min_height - h)
pad_w = max(0, min_width - w)
if pad_h == 0 and pad_w == 0:
return image
# A.PadIfNeeded pads to bottom-right with constant value
if len(image.shape) == 3:
# Color image - pad bottom and right with white (255, 255, 255)
padded = np.pad(
image,
((0, pad_h), (0, pad_w), (0, 0)),
mode="constant",
constant_values=255,
)
else:
# Grayscale image - pad with white (255)
padded = np.pad(
image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
)
return padded
def preprocess(
self,
images: Image.Image | list[Image.Image],
**kwargs,
) -> dict[str, torch.Tensor]:
"""
Preprocess an image or batch of images for the NemotronParse model.
Args:
images: Input image(s)
"""
# Ensure images is a list
if not isinstance(images, list):
images = [images]
# Convert PIL images to numpy arrays if needed
processed_images = []
for image in images:
if isinstance(image, Image.Image):
image = np.asarray(image)
processed_images.append(image)
# Apply NemotronParse-specific transforms
pixel_values = []
for image in processed_images:
# Manual resize with aspect ratio preservation
# (replaces LongestMaxSizeHW)
processed_image = self._resize_with_aspect_ratio(image)
# Apply remaining albumentations transforms if available
if self.transform is not None:
transformed = self.transform(image=processed_image)
processed_image = transformed["image"]
else:
# Fallback: just pad to target size
processed_image = self._pad_to_size(processed_image)
# Convert to tensor
pixel_values_tensor = self.torch_transform(processed_image)
# Handle grayscale images
if pixel_values_tensor.shape[0] == 1:
pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
pixel_values.append(pixel_values_tensor)
# Stack into batch
pixel_values = torch.stack(pixel_values)
# Normalize pixel values
normalized_values = (pixel_values - self.norm_mean) / self.norm_std
return {"pixel_values": normalized_values}
def __call__(
self, images: Image.Image | list[Image.Image], **kwargs
) -> dict[str, torch.Tensor]:
return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
"""
NemotronParse Processor
"""
def __init__(
self,
config: PretrainedConfig,
tokenizer: AnyTokenizer,
**kwargs,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)
def _make_batch_input(self, input_item=None):
if input_item is None:
input_item = []
if not isinstance(input_item, list):
input_item = [input_item]
return input_item
def __call__(
self,
text: str | None = None,
images: Image.Image | list[Image.Image] | None = None,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text, images = [self._make_batch_input(x) for x in (text, images)]
image_inputs = {} if len(images) == 0 else self.image_processor(images)
text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
combined_outputs = BatchFeature(
data={**text_inputs, **image_inputs},
tensor_type=return_tensors,
)
return combined_outputs
class NemotronParseProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_hf_processor(self, **kwargs) -> NemotronParseProcessor:
return self.ctx.init_processor(
NemotronParseProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": 1}
def get_num_image_tokens(self) -> int:
config = self.get_hf_config()
final_size = config.image_size
patch_size = config.encoder.patch_size
return (final_size[0] // patch_size) * ((final_size[1] // patch_size) // 4) + 1
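# Illustration (assuming, for example, an encoder patch_size of 16 with the default
# 2048x1648 image size): (2048 // 16) * ((1648 // 16) // 4) + 1 = 128 * 25 + 1 = 3201.
# The // 4 mirrors the width pooling done by RadioWithNeck's strided conv2, and the
# + 1 accounts for the appended summary token.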
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int] | None:
image_tokens = self.get_num_image_tokens()
return {"image": image_tokens}
class NemotronParseDummyInputsBuilder(
BaseDummyInputsBuilder[NemotronParseProcessingInfo]
):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
return ""
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_hf_config().image_size
return {
"image": self._get_dummy_images(
width=target_width, height=target_height, num_images=num_images
)
}
class NemotronParseMultiModalProcessor(
EncDecMultiModalProcessor[NemotronParseProcessingInfo]
):
def create_encoder_prompt(
self,
prompt: str | list[int],
mm_data: MultiModalDataDict,
) -> str | list[int]:
return [0]
@property
def pad_dummy_encoder_prompt(self) -> bool:
return True
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
if mm_data:
processed_outputs = super()._call_hf_processor(
prompt, mm_data, mm_kwargs, tok_kwargs
)
else:
hf_processor = self.info.get_hf_processor()
tokenizer = hf_processor.tokenizer
processed_outputs = tokenizer(
prompt, add_special_tokens=False, return_tensors="pt"
)
return processed_outputs
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(pixel_values=MultiModalFieldConfig.batched("image"))
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
num_image_tokens = self.info.get_num_image_tokens()
return [
PromptReplacement(
modality="image",
target=[0],
replacement=[0] * num_image_tokens,
)
]
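# The single dummy token id 0 emitted by create_encoder_prompt is expanded to
# num_image_tokens placeholder positions, so the encoder prompt length matches the
# number of vision embeddings produced by RadioWithNeck.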
class RadioWithNeck(nn.Module):
"""Vision encoder using RADIO model with custom neck."""
def __init__(
self,
config: PretrainedConfig,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
):
super().__init__()
self.config = config.encoder
self.model_encoder = self.get_vit_model_from_radio_config(
config, quant_config=quant_config
)
# Neck components
last_hidden_state = 1024
self.conv1 = nn.Conv1d(1280, last_hidden_state, 1)
self.layer_norm1 = nn.LayerNorm(
last_hidden_state, eps=1e-06, elementwise_affine=True
)
self.conv2 = nn.Conv2d(
last_hidden_state,
last_hidden_state,
kernel_size=(1, 4),
stride=(1, 4),
padding=0,
bias=False,
)
self.layer_norm2 = nn.LayerNorm(
last_hidden_state, eps=1e-06, elementwise_affine=True
)
self.sum_proj = ColumnParallelLinear(
3840,
last_hidden_state,
quant_config=quant_config,
prefix=f"{prefix}.sum_proj",
)
self.layer_norm3 = nn.LayerNorm(
last_hidden_state, eps=1e-06, elementwise_affine=True
)
def get_vit_model_from_radio_config(
self,
hf_config: PretrainedConfig,
quant_config: QuantizationConfig | None = None,
) -> RadioModel:
hf_config_vision = hf_config.encoder
model_name = hf_config_vision.args.get("model")
if model_name is None:
raise ValueError(f"Unsupported vit model type: {model_name}")
radio_config = RadioConfig(
model_name=model_name,
image_size=hf_config.image_size,
**hf_config_vision.args,
)
return RadioModel(config=radio_config, quant_config=quant_config)
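# Shape flow of the neck for an input of (B, 3, H, W) with patch size P:
#   RADIO encoder -> summary vector (fed to sum_proj, input dim 3840) and
#                    patch features (B, H/P * W/P, 1280)
#   conv1 + layer_norm1 -> features projected to 1024 channels
#   rearrange to (B, 1024, H/P, W/P), then conv2 with stride (1, 4) pools width by 4
#   rearrange back + layer_norm2 -> (B, H/P * (W/P // 4), 1024)
#   sum_proj + layer_norm3 -> projected summary appended as one extra token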
def forward(self, pixel_values: torch.Tensor, **kwargs) -> torch.Tensor:
summary, feature = self.model_encoder(pixel_values)
output = self.conv1(feature.permute(0, 2, 1)).permute(0, 2, 1)
output = self.layer_norm1(output)
patch_size = self.config.patch_size
output = rearrange(
output,
"b (h w) d -> b d h w",
h=pixel_values.shape[-2] // patch_size,
w=pixel_values.shape[-1] // patch_size,
)
output = self.conv2(output)
output = rearrange(output, "b d h w -> b (h w) d")
output = self.layer_norm2(output)
summary = self.layer_norm3(self.sum_proj(summary)[0])
output = torch.cat((output, summary.unsqueeze(1)), dim=1)
return output
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
model_encoder_weights = []
adaptor_dict = {
name: param
for name, param in dict(self.named_parameters()).items()
if not name.startswith("model_encoder")
}
for name, w in weights:
if name.startswith("model_encoder"):
model_encoder_weights.append((".".join(name.split(".")[1:]), w))
else:
param = adaptor_dict[name]
with torch.no_grad():
default_weight_loader(param, w)
self.model_encoder.load_weights(model_encoder_weights)
@MULTIMODAL_REGISTRY.register_processor(
NemotronParseMultiModalProcessor,
info=NemotronParseProcessingInfo,
dummy_inputs=NemotronParseDummyInputsBuilder,
)
class NemotronParseForConditionalGeneration(nn.Module, SupportsMultiModal):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
self.config = config
self.vision_config = config.encoder
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
self.encoder = RadioWithNeck(
config=config, quant_config=quant_config, prefix=f"{prefix}.encoder"
)
self.decoder = MBartDecoderNoPos(
config.decoder,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.decoder",
)
self.vocab_size = config.decoder.vocab_size
self.lm_head = ParallelLMHead(
config.decoder.vocab_size, config.decoder.d_model, quant_config=quant_config
)
self.logits_processor = LogitsProcessor(
self.vocab_size, config.decoder.vocab_size
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def _parse_and_validate_image_input(
self, **kwargs: object
) -> NemotronParsePixelInputs | None:
pixel_values = kwargs.pop("pixel_values", None)
image_embeds = kwargs.pop("image_embeds", None)
if pixel_values is None and image_embeds is None:
return None
if pixel_values is not None and image_embeds is not None:
raise ValueError("Both pixel values and image embeds are provided.")
if pixel_values is not None:
h, w = self.config.image_size
return NemotronParsePixelInputs(
type="pixel_values",
data=pixel_values,
resolve_bindings={
"h": h,
"w": w,
},
)
if image_embeds is not None:
raise NotImplementedError
raise AssertionError("This line should be unreachable.")
def _process_image_input(
self, image_input: NemotronParsePixelInputs
) -> torch.Tensor:
assert image_input["type"] == "pixel_values"
pixel_values = image_input["data"]
dtype = next(self.encoder.parameters()).dtype
pixel_values = pixel_values.to(dtype)
return self.encoder(pixel_values)
def get_language_model(self) -> torch.nn.Module:
return self.decoder
def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
encoder_outputs: list[torch.Tensor] | None = None,
**kwargs,
) -> torch.Tensor:
r"""
Args:
input_ids: torch.Tensor of *decoder* input token ids.
positions: torch.Tensor of *decoder* position indices.
encoder_outputs: List of encoder output tensors (vision embeddings).
During profiling, this may be None or empty.
Returns:
Output torch.Tensor
"""
inputs_embeds = None
if encoder_outputs:
inputs_embeds = torch.cat(encoder_outputs, dim=0)
hidden_states = self.decoder(
decoder_input_ids=input_ids, encoder_hidden_states=inputs_embeds
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
) -> torch.Tensor | None:
return self.logits_processor(self.lm_head, hidden_states)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
lm_head_dict = dict(self.lm_head.named_parameters())
def is_encoder(name: str) -> bool:
return name.startswith("encoder")
def is_decoder(name: str) -> bool:
return name.startswith("decoder")
def is_lm_head(name: str):
return name.startswith("lm_head")
# Separate weights by component
encoder_weights = []
decoder_weights = []
for name, w in weights:
if is_encoder(name):
encoder_weights.append((".".join(name.split(".")[1:]), w))
elif is_decoder(name):
decoder_weights.append((".".join(name.split(".")[1:]), w))
elif is_lm_head(name):
trimmed_name = ".".join(name.split(".")[1:])
param = lm_head_dict[trimmed_name]
with torch.no_grad():
default_weight_loader(param, w)
else:
logger.info("Found unexpected weight: %s", name)
# Load encoder weights
self.encoder.load_weights(encoder_weights)
# Load decoder weights
self.decoder.load_weights(decoder_weights)


@@ -427,15 +427,17 @@ class RadioInternVisionModel(nn.Module):
to_2tuple(config.patch_size), config.image_size
)
max_img_size = int(
round(config.max_img_size / config.patch_size) * config.patch_size
round(config.cpe_max_size / config.patch_size) * config.patch_size
)
unique_teachers = set(t["name"] for t in config.teachers)
self.patch_generator = ViTPatchGenerator(
config.patch_size,
config.hidden_size,
input_dims=self.img_size,
max_input_dims=max_img_size,
cls_token=True,
register_multiple=config.reg_tokens,
num_cls_tokens=len(unique_teachers) if config.cls_token_per_teacher else 1,
register_multiple=config.register_multiple,
)
self.encoder = InternVisionEncoder(
@@ -489,11 +491,20 @@ class RadioModel(nn.Module):
prefix=prefix,
)
summary_idxs = None
if config.teachers:
summary_idxs = torch.tensor(
[i for i, t in enumerate(config.teachers) if t.get("use_summary", True)]
)
if summary_idxs.numel() > 0:
self.register_buffer("summary_idxs", summary_idxs)
self.summary_idxs = summary_idxs
def forward(
self,
pixel_values: torch.Tensor | None = None,
pixel_embeds: torch.Tensor | None = None,
) -> torch.FloatTensor:
) -> tuple[torch.FloatTensor, torch.FloatTensor]:
y = self.model(pixel_values)
return self._extract_final(y)
@@ -546,10 +557,17 @@ class RadioModel(nn.Module):
return loaded_params
def _extract_final(self, y: torch.Tensor):
def _extract_final(
self, y: torch.Tensor
) -> tuple[torch.FloatTensor, torch.FloatTensor]:
# Remove CLS + REGISTERS tokens
patch_gen = getattr(self.model, "patch_generator", None)
if patch_gen is not None:
all_summary = y[:, : patch_gen.num_cls_tokens]
if self.summary_idxs is not None:
bb_summary = all_summary[:, self.summary_idxs]
else:
bb_summary = all_summary
all_feat = y[:, patch_gen.num_skip :]
return all_feat
return bb_summary.flatten(1), all_feat


@@ -428,6 +428,10 @@ _MULTIMODAL_MODELS = {
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
"VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501
# [Encoder-decoder]
"NemotronParseForConditionalGeneration": (
"nemotron_parse",
"NemotronParseForConditionalGeneration",
),
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
}
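
With the registry entry above in place, the architecture should be resolvable by name. A quick sanity check, as a sketch assuming ModelRegistry.get_supported_archs() as exposed by vLLM's registry:

from vllm import ModelRegistry

# The string must match the key added to _MULTIMODAL_MODELS above.
assert "NemotronParseForConditionalGeneration" in ModelRegistry.get_supported_archs()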