diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 44e990d29..35c10ec0b 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -4,7 +4,6 @@ import torch -from vllm.config import MultiModalConfig from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.models.vision import get_vit_attn_backend @@ -32,7 +31,6 @@ class MMEncoderAttention(CustomOp): scale: float | None = None, num_kv_heads: int | None = None, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, ) -> None: """ Args: @@ -42,7 +40,6 @@ class MMEncoderAttention(CustomOp): num_kv_heads: number of kv heads. prefix: This has no effect, it is only here to make it easier to swap between Attention and MultiHeadAttention - multimodal_config: configs for multi-modal. """ super().__init__() @@ -62,16 +59,10 @@ class MMEncoderAttention(CustomOp): # weight and activation dtype. dtype = torch.get_default_dtype() - # Try to get vision attention backend from multimodal_config. - attn_backend_override = None - if multimodal_config is not None: - attn_backend_override = multimodal_config.mm_encoder_attn_backend - # Get device-specific vision attention backend. self.attn_backend = get_vit_attn_backend( head_size=head_size, dtype=dtype, - attn_backend_override=attn_backend_override, ) self.is_flash_attn_backend = self.attn_backend in { diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index c0ee04560..481f5ae6d 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -16,7 +16,7 @@ from transformers import ( from vllm.attention.layer import Attention from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention @@ -59,6 +59,7 @@ from .vision import ( VisionFeatureSelectStrategy, VisionFeatureSelectStrategyStr, get_num_selected_vision_tokens, + is_vit_use_data_parallel, resolve_visual_encoder_outputs, ) @@ -353,7 +354,6 @@ class CLIPAttention(nn.Module): self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", attn_cls: type[Attention] | type[MMEncoderAttention], @@ -372,11 +372,7 @@ class CLIPAttention(nn.Module): ) self.scale = self.head_dim**-0.5 - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, head_size=self.head_dim, @@ -405,7 +401,6 @@ class CLIPAttention(nn.Module): self.head_dim, self.scale, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) else: self.attn = attn_cls( @@ -434,17 +429,12 @@ class CLIPMLP(nn.Module): self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() self.config = config - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if 
multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear( @@ -477,7 +467,6 @@ class CLIPEncoderLayer(nn.Module): self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", attn_cls: type[Attention] | type[MMEncoderAttention], @@ -487,7 +476,6 @@ class CLIPEncoderLayer(nn.Module): self.self_attn = CLIPAttention( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", attn_cls=attn_cls, ) @@ -495,7 +483,6 @@ class CLIPEncoderLayer(nn.Module): self.mlp = CLIPMLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -528,7 +515,6 @@ class CLIPEncoder(nn.Module): self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, num_hidden_layers_override: int | None = None, *, prefix: str = "", @@ -548,7 +534,6 @@ class CLIPEncoder(nn.Module): CLIPEncoderLayer( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{layer_idx}", attn_cls=attn_cls, ) @@ -658,7 +643,6 @@ class CLIPVisionTransformer(nn.Module): self, config: CLIPVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, @@ -678,7 +662,6 @@ class CLIPVisionTransformer(nn.Module): self.encoder = CLIPEncoder( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.encoder", attn_cls=MMEncoderAttention, @@ -780,7 +763,6 @@ class CLIPVisionModel(nn.Module): self, config: CLIPVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, @@ -791,7 +773,6 @@ class CLIPVisionModel(nn.Module): self.vision_model = CLIPVisionTransformer( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, prefix=f"{prefix}.vision_model", @@ -869,7 +850,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): self.vision_model = CLIPVisionTransformer( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_model"), ) self.visual_projection = nn.Linear( diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py index b3e5d920e..46ce87b34 100644 --- a/vllm/model_executor/models/deepencoder.py +++ b/vllm/model_executor/models/deepencoder.py @@ -18,7 +18,6 @@ import torch.nn as nn import torch.nn.functional as F from transformers import CLIPVisionConfig -from vllm.config import MultiModalConfig from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.quantization import QuantizationConfig @@ -609,7 +608,6 @@ class DeepCLIPVisionTransformer(nn.Module): self, config: CLIPVisionConfig, quant_config: 
QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, prefix: str = "", @@ -628,7 +626,6 @@ class DeepCLIPVisionTransformer(nn.Module): self.transformer = CLIPEncoder( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.encoder", attn_cls=MMEncoderAttention, diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 6f7c13193..bfbf06467 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -398,7 +398,6 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports self.vision_model = DeepCLIPVisionTransformer( config=clip_vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_model"), ) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index ca360210b..c44992a9a 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -8,7 +8,7 @@ import torch.nn as nn from torch.nn import LayerNorm from transformers.models.qwen2_vl import Qwen2VLProcessor -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import utils as dist_utils from vllm.distributed.parallel_state import ( @@ -60,7 +60,7 @@ from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionCon from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.v1.attention.backends.registry import AttentionBackendEnum -from .vision import run_dp_sharded_mrope_vision_model +from .vision import is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model IMAGE_TOKEN = "<|imgpad|>" @@ -183,9 +183,9 @@ class PatchMerger(nn.Module): spatial_merge_size: int = 2, pre_norm="layernorm", prefix: str = "", - use_data_parallel: bool = False, ) -> None: super().__init__() + use_data_parallel = is_vit_use_data_parallel() self.hidden_size = context_dim * (spatial_merge_size**2) self.pre_norm = pre_norm if self.pre_norm == "layernorm": @@ -230,15 +230,10 @@ class DotsVisionAttention(nn.Module): bias: bool = True, *, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.embed_dim = dim self.tp_size = ( @@ -272,7 +267,6 @@ class DotsVisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) @@ -329,7 +323,6 @@ class DotsSwiGLUFFN(nn.Module): config, *, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -337,11 +330,7 @@ class DotsSwiGLUFFN(nn.Module): in_features = config.embed_dim bias = config.use_bias - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() # Referenced aimv2.py AIMv2SwiGLUFFN self.fc13 = MergedColumnParallelLinear( in_features, @@ 
-447,7 +436,6 @@ class DotsVisionBlock(nn.Module): config, *, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -458,14 +446,12 @@ class DotsVisionBlock(nn.Module): num_heads=config.num_attention_heads, bias=config.use_bias, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) self.mlp = DotsSwiGLUFFN( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps) @@ -493,7 +479,6 @@ class DotsVisionTransformer(nn.Module): self, config: DotsVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, @@ -507,15 +492,9 @@ class DotsVisionTransformer(nn.Module): head_dim = config.embed_dim // config.num_attention_heads self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend - if multimodal_config is not None - else None - ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) self.out_hidden_size = config.hidden_size # Keep blocks for compatibility with other vision towers @@ -529,7 +508,6 @@ class DotsVisionTransformer(nn.Module): DotsVisionBlock( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{i}", ) for i in range(num_layers) @@ -542,16 +520,10 @@ class DotsVisionTransformer(nn.Module): else: self.post_trunk_norm = None - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) self.merger = PatchMerger( dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size, - use_data_parallel=use_data_parallel, ) @property @@ -693,7 +665,6 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA self.vision_tower = DotsVisionTransformer( vision_config, quant_config=self.quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/eagle2_5_vl.py b/vllm/model_executor/models/eagle2_5_vl.py index e33abbe5b..419522ade 100644 --- a/vllm/model_executor/models/eagle2_5_vl.py +++ b/vllm/model_executor/models/eagle2_5_vl.py @@ -270,7 +270,6 @@ class Eagle2_5_VLForConditionalGeneration( return SiglipVisionModel( vision_config, quant_config=quant_config, - multimodal_config=self.multimodal_config, num_hidden_layers_override=num_hidden_layers, prefix=prefix, ) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 27b26b532..869a9a458 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -36,7 +36,7 @@ import torch.nn.functional as F from einops import rearrange from transformers import BatchFeature -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -119,7 +119,6 @@ class Ernie4_5_VisionAttention(nn.Module): num_heads: int, projection_size: int, quant_config: 
QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -153,7 +152,6 @@ class Ernie4_5_VisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) @@ -266,7 +264,6 @@ class Ernie4_5_VisionBlock(nn.Module): act_layer: type[nn.Module] = QuickGELU, norm_layer: Callable[[int], nn.Module] | None = None, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -282,7 +279,6 @@ class Ernie4_5_VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) @@ -357,7 +353,6 @@ class Ernie4_5_VisionTransformer(nn.Module): vision_config, norm_eps: float = 1e-6, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -393,7 +388,6 @@ class Ernie4_5_VisionTransformer(nn.Module): mlp_ratio=mlp_ratio, norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", ) for layer_idx in range(depth) @@ -405,13 +399,9 @@ class Ernie4_5_VisionTransformer(nn.Module): ) self.ln = nn.LayerNorm(hidden_size, eps=1e-6) - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend if multimodal_config else None - ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) @property @@ -1308,7 +1298,6 @@ class Ernie4_5_VLMoeForConditionalGeneration( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_model"), ) self.resampler_model = VariableResolutionResamplerModel( diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index acb52f8d6..5db7a18f6 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -46,7 +46,7 @@ from transformers.models.glm4v.image_processing_glm4v import ( from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor from transformers.video_utils import VideoMetadata -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state from vllm.distributed import utils as dist_utils @@ -107,6 +107,7 @@ from .utils import ( ) from .vision import ( get_vit_attn_backend, + is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model, ) @@ -196,15 +197,10 @@ class Glm4vVisionMLP(nn.Module): hidden_features: int, bias: bool = False, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.gate_up_proj = MergedColumnParallelLinear( input_size=in_features, output_sizes=[hidden_features] * 2, @@ -258,16 +254,11 @@ class Glm4vVisionAttention(nn.Module): num_heads: int, 
projection_size: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() # Per attention head and per partition values. - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.tp_size = ( 1 if use_data_parallel else get_tensor_model_parallel_world_size() ) @@ -305,7 +296,6 @@ class Glm4vVisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) @@ -373,7 +363,6 @@ class Glm4vVisionBlock(nn.Module): mlp_hidden_dim: int, norm_layer: Callable[[int], nn.Module] | None = None, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -386,7 +375,6 @@ class Glm4vVisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.mlp = Glm4vVisionMLP( @@ -394,7 +382,6 @@ class Glm4vVisionBlock(nn.Module): mlp_hidden_dim, bias=False, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -454,16 +441,11 @@ class Glm4vPatchMerger(nn.Module): d_model: int, context_dim: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, bias: bool = False, prefix: str = "", ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.hidden_size = d_model self.proj = ColumnParallelLinear( self.hidden_size, @@ -619,13 +601,10 @@ class Glm4vVisionTransformer(nn.Module): vision_config: Glm4vVisionConfig, norm_eps: float = 1e-6, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() - assert multimodal_config is not None, "multimodal_config must be provided" - patch_size = vision_config.patch_size temporal_patch_size = vision_config.temporal_patch_size in_channels = vision_config.in_channels @@ -660,7 +639,6 @@ class Glm4vVisionTransformer(nn.Module): mlp_hidden_dim=vision_config.out_hidden_size, norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", ) for layer_idx in range(depth) @@ -670,7 +648,6 @@ class Glm4vVisionTransformer(nn.Module): d_model=vision_config.out_hidden_size, context_dim=vision_config.intermediate_size, quant_config=quant_config, - multimodal_config=multimodal_config, bias=False, prefix=f"{prefix}.merger", ) @@ -692,7 +669,6 @@ class Glm4vVisionTransformer(nn.Module): self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=multimodal_config.mm_encoder_attn_backend, ) @property @@ -1439,7 +1415,6 @@ class Glm4vForConditionalGeneration( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-5), quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index ba79ba66c..a4e309ee9 100644 
--- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -33,7 +33,7 @@ import torch.nn as nn import torch.nn.functional as F from transformers import BatchFeature -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -80,7 +80,6 @@ from vllm.transformers_utils.configs.hunyuan_vl import ( from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize from vllm.utils.tensor_schema import TensorSchema, TensorShape -from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interfaces import ( MultiModalEmbeddings, @@ -96,6 +95,7 @@ from .utils import ( init_vllm_registered_model, maybe_prefix, ) +from .vision import is_vit_use_data_parallel logger = init_logger(__name__) @@ -160,9 +160,9 @@ class HunYuanVisionMLP(nn.Module): act_fn: Callable[[torch.Tensor], torch.Tensor] = F.gelu, quant_config: QuantizationConfig | None = None, prefix: str = "", - use_data_parallel: bool = False, ): super().__init__() + use_data_parallel = is_vit_use_data_parallel() self.dense_h_to_4h = ColumnParallelLinear( in_features, hidden_features, @@ -194,12 +194,11 @@ class HunYuanVisionAttention(nn.Module): num_heads: int, projection_size: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", - use_data_parallel: bool = False, ) -> None: super().__init__() # Per attention head and per partition values. + use_data_parallel = is_vit_use_data_parallel() self.tp_size = ( 1 if use_data_parallel @@ -237,7 +236,6 @@ class HunYuanVisionAttention(nn.Module): self.hidden_size_per_attention_head, self.scale, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) def forward( @@ -260,9 +258,7 @@ class HunYuanVisionBlock(nn.Module): act_fn: Callable[[torch.Tensor], torch.Tensor] = F.gelu, norm_layer: Callable[[int], nn.Module] | None = None, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", - use_data_parallel: bool = False, ) -> None: super().__init__() if norm_layer is None: @@ -274,9 +270,7 @@ class HunYuanVisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", - use_data_parallel=use_data_parallel, ) self.mlp = HunYuanVisionMLP( dim, @@ -285,7 +279,6 @@ class HunYuanVisionBlock(nn.Module): bias=True, quant_config=quant_config, prefix=f"{prefix}.mlp", - use_data_parallel=use_data_parallel, ) def forward( @@ -439,9 +432,6 @@ class HunYuanVisionTransformer(nn.Module): vision_config: HunYuanVLVisionConfig, quant_config: QuantizationConfig | None = None, prefix: str = "", - use_data_parallel: bool = False, - multimodal_config: MultiModalConfig | None = None, - attn_backend_override: AttentionBackendEnum | None = None, ) -> None: super().__init__() @@ -467,9 +457,7 @@ class HunYuanVisionTransformer(nn.Module): act_fn=get_act_fn(vision_config.hidden_act), norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{layer_idx}", - use_data_parallel=use_data_parallel, ) for layer_idx in range(num_hidden_layers) ] @@ -872,23 +860,14 @@ class HunYuanVLForConditionalGeneration( def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: HunYuanVLConfig = vllm_config.model_config.hf_config - multimodal_config = vllm_config.model_config.multimodal_config self.config = config - self.multimodal_config = multimodal_config with self._mark_tower_model(vllm_config, {"image"}): - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend - if multimodal_config is not None - else None - ) self.visual = HunYuanVisionTransformer( config.vision_config, - quant_config=self.quant_config, + quant_config=vllm_config.quant_config, prefix=maybe_prefix(prefix, "visual"), - multimodal_config=multimodal_config, - attn_backend_override=attn_backend_override, ) with self._mark_language_model(vllm_config): diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 215402d0d..062ad2eb3 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -17,7 +17,7 @@ from timm.models.regnet import RegStage from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -360,7 +360,6 @@ def _build_hcxvision_hf_processor( def init_vision_tower_for_hcxvision( vision_config, quant_config: QuantizationConfig | None, - multimodal_config: MultiModalConfig | None, *, use_nth_layer: int | None = None, require_post_norm: bool | None = None, @@ -378,7 +377,6 @@ def init_vision_tower_for_hcxvision( return CLIPVisionModel( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, prefix=prefix, @@ -387,7 +385,6 @@ def init_vision_tower_for_hcxvision( return SiglipVisionModel( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, prefix=prefix, @@ -605,7 +602,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): # init configs config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config # text_config text_config = config.text_config if text_config.model_type in ["gpt2", "hyperclovax", "llama"]: @@ -628,7 +624,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.vision_model = init_vision_tower_for_hcxvision( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, use_nth_layer=getattr(config, "use_nth_layer", -1), require_post_norm=False, prefix=maybe_prefix(prefix, "vision_model"), diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index a69b2226d..382982805 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -16,7 +16,7 @@ from transformers.image_processing_utils import BatchFeature from transformers.tokenization_utils import TensorType from typing_extensions import TypedDict, Unpack -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.model import ModelConfig from vllm.distributed import parallel_state from 
vllm.distributed import utils as dist_utils @@ -72,6 +72,8 @@ from vllm.transformers_utils.configs import ( ) from vllm.utils.tensor_schema import TensorSchema, TensorShape +from .vision import is_vit_use_data_parallel + def create_cumulative_seq_lengths( seq_sizes: torch.Tensor, device: torch.device @@ -942,15 +944,10 @@ class Siglip2VisionAttention(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.tp_size = ( 1 if use_data_parallel @@ -987,7 +984,6 @@ class Siglip2VisionAttention(nn.Module): head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: @@ -1038,7 +1034,6 @@ class Siglip2EncoderLayer(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() self.embed_dim = config.hidden_size @@ -1047,7 +1042,6 @@ class Siglip2EncoderLayer(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.self_attn", - multimodal_config=multimodal_config, ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( @@ -1088,7 +1082,6 @@ class Siglip2Encoder(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() self.config = config @@ -1098,7 +1091,6 @@ class Siglip2Encoder(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}", - multimodal_config=multimodal_config, ) for layer_idx in range(config.num_hidden_layers) ] @@ -1127,7 +1119,6 @@ class Siglip2VisionTransformer(nn.Module): config: PixelShuffleSiglip2VisionConfig, quant_config: QuantizationConfig | None = None, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, ): super().__init__() self.config = config @@ -1140,7 +1131,6 @@ class Siglip2VisionTransformer(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.encoder", - multimodal_config=multimodal_config, ) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -1221,14 +1211,12 @@ class IsaacVisionEmbedding(nn.Module): hidden_dim: int, output_dim: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() self.transformer = Siglip2VisionTransformer( vision_cfg, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "0"), ) self.linear_fc1 = ColumnParallelLinear( @@ -1309,7 +1297,6 @@ class IsaacForConditionalGeneration( config: IsaacConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config - self.multimodal_config = vllm_config.model_config.multimodal_config head_dim = config.head_dim calculated_mrope_section = [ @@ -1373,7 +1360,6 @@ class IsaacForConditionalGeneration( hidden_dim=hidden_dim, output_dim=config.hidden_size, quant_config=quant_config, - multimodal_config=self.multimodal_config, prefix=maybe_prefix(prefix, "vision_embedding"), ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 
a24c60289..f4c22fa92 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -16,7 +16,7 @@ from transformers.feature_extraction_utils import BatchFeature from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from transformers.utils import torch_int -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -80,6 +80,7 @@ from .utils import ( is_pp_missing_parameter, maybe_prefix, ) +from .vision import is_vit_use_data_parallel logger = init_logger(__name__) @@ -358,7 +359,6 @@ class KeyeSiglipAttention(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -366,7 +366,8 @@ class KeyeSiglipAttention(nn.Module): hidden_size = config.hidden_size self.hidden_size = config.hidden_size - tp_size = get_tensor_model_parallel_world_size() + use_data_parallel = is_vit_use_data_parallel() + tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size() self.total_num_heads = config.num_attention_heads assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size @@ -403,7 +404,6 @@ class KeyeSiglipAttention(nn.Module): scale=self.scale, num_kv_heads=self.num_kv_heads, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) self.apply_rotary_emb = ApplyRotaryEmb( @@ -497,7 +497,6 @@ class KeyeSiglipEncoderLayer(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -506,14 +505,12 @@ class KeyeSiglipEncoderLayer(nn.Module): self.self_attn = KeyeSiglipAttention( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -552,7 +549,6 @@ class KeyeSiglipEncoder(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -565,7 +561,6 @@ class KeyeSiglipEncoder(nn.Module): KeyeSiglipEncoderLayer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{layer_idx}", ) for layer_idx in range(config.num_hidden_layers) @@ -647,7 +642,6 @@ class KeyeSiglipVisionTransformer(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -658,7 +652,6 @@ class KeyeSiglipVisionTransformer(nn.Module): self.encoder = KeyeSiglipEncoder( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.encoder", ) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -730,7 +723,6 @@ class KeyeSiglipVisionModel(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -738,7 +730,6 @@ class KeyeSiglipVisionModel(nn.Module): self.vision_model = 
KeyeSiglipVisionTransformer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.vision_model", ) self.quant_config = quant_config @@ -1275,16 +1266,13 @@ class BaseKeyeModule(nn.Module, SupportsMultiModal): super().__init__() config: PretrainedConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config self.config = config - self.multimodal_config = multimodal_config with self._mark_tower_model(vllm_config, {"image", "video"}): self.visual = KeyeSiglipVisionModel( config.vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) self.mlp_AR = self._build_projector( diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index c1d7a90f3..11cb2336d 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -317,7 +317,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): with self._mark_tower_model(vllm_config, "image"): self.vision_tower = MoonVitPretrainedModel( config.vision_config, - multimodal_config=model_config.multimodal_config, prefix=maybe_prefix(prefix, "vision_tower"), ) self.multi_modal_projector = KimiVLMultiModalProjector( diff --git a/vllm/model_executor/models/lfm2_siglip2.py b/vllm/model_executor/models/lfm2_siglip2.py index 439dba5da..960f119a5 100644 --- a/vllm/model_executor/models/lfm2_siglip2.py +++ b/vllm/model_executor/models/lfm2_siglip2.py @@ -11,7 +11,6 @@ from torch.nn import functional as F from transformers import Siglip2VisionConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import MultiModalConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention @@ -23,7 +22,7 @@ from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from .vision import should_torch_compile_mm_vit +from .vision import is_vit_use_data_parallel, should_torch_compile_mm_vit class Siglip2VisionEmbeddings(nn.Module): @@ -154,7 +153,6 @@ class Siglip2Attention(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -171,10 +169,7 @@ class Siglip2Attention(nn.Module): self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout - use_data_parallel = ( - multimodal_config is not None - and multimodal_config.mm_encoder_tp_mode == "data" - ) + use_data_parallel = is_vit_use_data_parallel() tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size() assert self.num_heads % tp_size == 0 self.num_heads_per_partition = self.num_heads // tp_size @@ -199,7 +194,6 @@ class Siglip2Attention(nn.Module): head_size=self.head_dim, scale=self.scale, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) def forward( @@ -241,16 +235,12 @@ class Siglip2MLP(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() self.config = config self.activation_fn = 
get_act_fn(config.hidden_act) - use_data_parallel = ( - multimodal_config is not None - and multimodal_config.mm_encoder_tp_mode == "data" - ) + use_data_parallel = is_vit_use_data_parallel() self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, @@ -282,7 +272,6 @@ class Siglip2EncoderLayer(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -291,14 +280,12 @@ class Siglip2EncoderLayer(nn.Module): self.self_attn = Siglip2Attention( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = Siglip2MLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -344,7 +331,6 @@ class Siglip2Encoder(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -354,7 +340,6 @@ class Siglip2Encoder(nn.Module): Siglip2EncoderLayer( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{idx}", ) for idx in range(config.num_hidden_layers) @@ -383,7 +368,6 @@ class Siglip2VisionTransformer(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -397,7 +381,6 @@ class Siglip2VisionTransformer(nn.Module): self.encoder = Siglip2Encoder( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.encoder", ) num_hidden_layers = config.num_hidden_layers @@ -438,7 +421,6 @@ class Siglip2Model(torch.nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -446,7 +428,6 @@ class Siglip2Model(torch.nn.Module): self.vision_model = Siglip2VisionTransformer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.vision_model", ) diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index f70675171..8febeb4c0 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py @@ -600,7 +600,6 @@ class Lfm2VLForConditionalGeneration( self.vision_tower = Siglip2Model( config=vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_tower"), ) else: diff --git a/vllm/model_executor/models/lightonocr.py b/vllm/model_executor/models/lightonocr.py index 897fbfdad..f88fa3f1a 100644 --- a/vllm/model_executor/models/lightonocr.py +++ b/vllm/model_executor/models/lightonocr.py @@ -166,7 +166,6 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration): self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index b4e41e0a5..37cf301a2 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -19,7 +19,7 @@ from transformers.models.llava import LlavaProcessor from 
transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.quantization import QuantizationConfig @@ -456,7 +456,6 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: def init_vision_tower_for_llava( hf_config: LlavaLikeConfig, quant_config: QuantizationConfig | None, - multimodal_config: MultiModalConfig | None, *, require_post_norm: bool | None = None, prefix: str = "", @@ -470,7 +469,6 @@ def init_vision_tower_for_llava( return CLIPVisionModel( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, prefix=prefix, @@ -479,7 +477,6 @@ def init_vision_tower_for_llava( return SiglipVisionModel( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, prefix=prefix, @@ -488,7 +485,6 @@ def init_vision_tower_for_llava( return PixtralHFVisionModel( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, prefix=prefix, @@ -562,7 +558,6 @@ class LlavaForConditionalGeneration( self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index d11869874..a6cc21d8c 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -272,7 +272,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 3f552c957..837dc2ee6 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -332,7 +332,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index c24fb928f..3b91ebe29 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -513,7 +513,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 513c46265..b4bc1388a 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ 
b/vllm/model_executor/models/minimax_vl_01.py @@ -205,7 +205,6 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 2123eff63..9b49f7b47 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -16,7 +16,7 @@ from transformers import ( from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear @@ -382,7 +382,6 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: def init_vision_tower_for_llava( hf_config: LlavaLikeConfig, quant_config: QuantizationConfig | None, - multimodal_config: MultiModalConfig | None, *, require_post_norm: bool | None = None, prefix: str = "", @@ -397,7 +396,6 @@ def init_vision_tower_for_llava( return PixtralHFVisionModel( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, require_post_norm=require_post_norm, prefix=prefix, @@ -461,7 +459,6 @@ class Mistral3ForConditionalGeneration( self.vision_tower = init_vision_tower_for_llava( config, quant_config=quant_config, - multimodal_config=multimodal_config, require_post_norm=False, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index c675b2cd6..823a8c0e7 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -52,7 +52,6 @@ import torch.nn.functional as F from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel -from vllm.config import MultiModalConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer @@ -62,6 +61,7 @@ from vllm.model_executor.layers.linear import ( RowParallelLinear, ) from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.models.vision import is_vit_use_data_parallel from vllm.platforms import current_platform from vllm.transformers_utils.configs.moonvit import MoonViTConfig @@ -308,11 +308,10 @@ class MLP2(nn.Module): activation, bias: bool = True, prefix: str = "", - use_data_parallel: bool = False, ): super().__init__() assert len(dims) == 3 - self.use_data_parallel = use_data_parallel + self.use_data_parallel = is_vit_use_data_parallel() self.fc0 = ColumnParallelLinear( dims[0], dims[1], @@ -343,17 +342,12 @@ class MoonVitEncoderLayer(nn.Module): hidden_dim: int, mlp_dim: int, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, *, activation=F.gelu, attn_bias: bool = False, ): super().__init__() - self.use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + self.use_data_parallel = is_vit_use_data_parallel() self.num_heads = num_heads self.hidden_dim = 
hidden_dim @@ -369,7 +363,6 @@ class MoonVitEncoderLayer(nn.Module): [hidden_dim, mlp_dim, hidden_dim], activation, prefix=f"{prefix}.mlp", - use_data_parallel=self.use_data_parallel, ) self.wqkv = QKVParallelLinear( hidden_size=hidden_dim, @@ -391,7 +384,6 @@ class MoonVitEncoderLayer(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) @@ -469,7 +461,6 @@ class MoonVitEncoder(nn.Module): num_layers: int, block_cfg: dict, prefix: str = "", - multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() @@ -479,7 +470,6 @@ class MoonVitEncoder(nn.Module): self.blocks = nn.ModuleList( [ MoonVitEncoderLayer( - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", **block_cfg, ) @@ -550,7 +540,6 @@ class MoonVitPretrainedModel(PreTrainedModel): def __init__( self, config: MoonViTConfig, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", *inputs, **kwargs, @@ -579,7 +568,6 @@ class MoonVitPretrainedModel(PreTrainedModel): "attn_bias": True, }, prefix=f"{prefix}.encoder", - multimodal_config=multimodal_config, ) def forward( diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py index 60a1701d0..cc860c939 100644 --- a/vllm/model_executor/models/opencua.py +++ b/vllm/model_executor/models/opencua.py @@ -244,7 +244,6 @@ class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration): vision_config=config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self.quant_config, - multimodal_config=self.multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 501116252..b2ba9b196 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -10,7 +10,7 @@ import torch import torch.nn as nn from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig @@ -103,7 +103,6 @@ class VisualTokenizer(torch.nn.Module): config: PretrainedConfig, visual_vocab_size: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -111,7 +110,6 @@ class VisualTokenizer(torch.nn.Module): self.vit = self._init_backbone( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.vit", ) # reserved tokens for INDICATOR_IDS @@ -130,7 +128,6 @@ class VisualTokenizer(torch.nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: QuantizationConfig | None = None, prefix: str = "", ): model_type = config.model_type @@ -138,7 +135,6 @@ class VisualTokenizer(torch.nn.Module): return Siglip2NavitModel( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=prefix, ) raise ValueError(f"Unsupported visual tokenizer model_type: {model_type}") @@ -464,7 +460,6 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): super().__init__() config = vllm_config.model_config.hf_config quant_config = 
vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config self.config: PretrainedConfig = config @@ -478,7 +473,6 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): self.visual_tokenizer = VisualTokenizer( config=config.vit_config, visual_vocab_size=config.visual_vocab_size, - multimodal_config=multimodal_config, quant_config=quant_config, prefix=f"{prefix}.visual_tokenizer", ) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 24749c7cf..5028468e3 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -30,7 +30,7 @@ from transformers.modeling_outputs import ( ) from transformers.utils import torch_int -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -532,7 +532,6 @@ class SiglipAttention(nn.Module): num_heads: int, projection_size: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -565,7 +564,6 @@ class SiglipAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.apply_rotary_emb = ApplyRotaryEmb( @@ -662,7 +660,6 @@ class SiglipEncoderLayer(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -673,14 +670,12 @@ class SiglipEncoderLayer(nn.Module): num_heads=config.num_attention_heads, projection_size=config.hidden_size, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -718,7 +713,6 @@ class SiglipEncoder(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -727,13 +721,9 @@ class SiglipEncoder(nn.Module): num_heads = config.num_attention_heads head_dim = embed_dim // num_heads - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend if multimodal_config else None - ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, @@ -748,7 +738,6 @@ class SiglipEncoder(nn.Module): SiglipEncoderLayer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{layer_idx}", ) for layer_idx in range(config.num_hidden_layers) @@ -830,7 +819,6 @@ class SiglipVisionTransformer(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -841,7 +829,6 @@ class SiglipVisionTransformer(nn.Module): self.encoder = SiglipEncoder( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.encoder", ) self.post_layernorm = 
nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -880,7 +867,6 @@ class SiglipVisionModel(nn.Module): self, config, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -888,7 +874,6 @@ class SiglipVisionModel(nn.Module): self.vision_model = SiglipVisionTransformer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.vision_model", ) self.quant_config = quant_config @@ -1010,16 +995,13 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config self.config = config - self.multimodal_config = multimodal_config with self._mark_tower_model(vllm_config, "image"): self.visual = SiglipVisionModel( config=config.vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) self.mlp_AR = Projector(config, config.vision_config) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 169182cc1..fc0f46dae 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -29,7 +29,7 @@ from transformers import ( ) from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding @@ -96,7 +96,6 @@ CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig( def _init_img_processor( hf_config: PretrainedConfig, quant_config: QuantizationConfig | None, - multimodal_config: MultiModalConfig | None, prefix: str = "", ) -> CLIPVisionModel: clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG @@ -111,7 +110,6 @@ def _init_img_processor( img_processor = CLIPVisionModel( clip_config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers, prefix=prefix, ) @@ -170,7 +168,6 @@ class Phi3HDImageEmbedding(nn.Module): self, config: PretrainedConfig, quant_config: QuantizationConfig | None, - multimodal_config: MultiModalConfig | None, prefix: str = "", ) -> None: super().__init__() @@ -181,7 +178,6 @@ class Phi3HDImageEmbedding(nn.Module): self.img_processor = _init_img_processor( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.img_processor", ) @@ -596,7 +592,6 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) self.vision_embed_tokens = Phi3HDImageEmbedding( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "model.vision_embed_tokens"), ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index b767bc160..c8eef850c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -28,7 +28,7 @@ from transformers.models.pixtral.modeling_pixtral import ( from transformers.tokenization_utils_base import TextInput from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from 
vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.conv import Conv2dLayer @@ -74,6 +74,7 @@ from .utils import init_vllm_registered_model, maybe_prefix from .vision import ( VisionEncoderInfo, VisionFeatureSelectStrategy, + is_vit_use_data_parallel, resolve_visual_encoder_outputs, ) @@ -1065,17 +1066,12 @@ class PixtralHFMLP(nn.Module): self, config: PixtralVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() assert config.intermediate_size is not None self.gate_up_proj = MergedColumnParallelLinear( @@ -1108,7 +1104,6 @@ class PixtralHFAttention(nn.Module): self, config: PixtralVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", ) -> None: @@ -1120,11 +1115,7 @@ class PixtralHFAttention(nn.Module): self.head_dim = config.hidden_size // config.num_attention_heads assert self.total_num_heads * self.head_dim == config.hidden_size - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.qkv_proj = QKVParallelLinear( hidden_size=config.hidden_size, head_size=self.head_dim, @@ -1189,7 +1180,6 @@ class PixtralHFTransformerBlock(nn.Module): self, config: PixtralVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", ) -> None: @@ -1199,13 +1189,11 @@ class PixtralHFTransformerBlock(nn.Module): self.attention = PixtralHFAttention( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attention", ) self.feed_forward = PixtralHFMLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.feed_forward", ) self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5) @@ -1232,7 +1220,6 @@ class PixtralHFTransformer(nn.Module): self, config: PixtralVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, prefix: str = "", @@ -1249,7 +1236,6 @@ class PixtralHFTransformer(nn.Module): PixtralHFTransformerBlock( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{layer_idx}", ) for layer_idx in range(num_hidden_layers) @@ -1281,7 +1267,6 @@ class PixtralHFVisionModel(nn.Module): self, config: PixtralVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, @@ -1302,7 +1287,6 @@ class PixtralHFVisionModel(nn.Module): self.transformer = PixtralHFTransformer( config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.transformer", ) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index ccb318ef2..bc4a0ecdd 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -846,7 +846,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( 
norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), quant_config=quant_config, prefix=maybe_prefix(prefix, "visual"), - multimodal_config=multimodal_config, ) with self._mark_language_model(vllm_config): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8ad16955c..0e99b050a 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,7 +43,7 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( ) from vllm.compilation.decorators import support_torch_compile -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.forward_context import set_forward_context @@ -109,6 +109,7 @@ from .utils import ( ) from .vision import ( get_vit_attn_backend, + is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model, ) @@ -266,15 +267,10 @@ class Qwen2_5_VisionMLP(nn.Module): bias: bool = False, act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.gate_up_proj = MergedColumnParallelLinear( input_size=in_features, output_sizes=[hidden_features] * 2, # [gate_proj, up_proj] @@ -308,16 +304,11 @@ class Qwen2_5_VisionAttention(nn.Module): num_heads: int, projection_size: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() # Per attention head and per partition values. 
- use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.tp_size = ( 1 if use_data_parallel @@ -354,7 +345,6 @@ class Qwen2_5_VisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) @@ -435,7 +425,6 @@ class Qwen2_5_VisionBlock(nn.Module): act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, norm_layer: Callable[[int], nn.Module] | None = None, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -448,7 +437,6 @@ class Qwen2_5_VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.mlp = Qwen2_5_VisionMLP( @@ -457,7 +445,6 @@ class Qwen2_5_VisionBlock(nn.Module): act_fn=act_fn, bias=True, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -530,15 +517,10 @@ class Qwen2_5_VisionPatchMerger(nn.Module): norm_layer: Callable[[int], nn.Module] | None = None, spatial_merge_size: int = 2, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.hidden_size = context_dim * (spatial_merge_size**2) if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) @@ -579,7 +561,6 @@ class Qwen2_5_VisionTransformer(nn.Module): vision_config: Qwen2_5_VLVisionConfig, norm_eps: float = 1e-6, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -620,15 +601,9 @@ class Qwen2_5_VisionTransformer(nn.Module): rope_parameters={"partial_rotary_factor": 0.5}, ) - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend - if multimodal_config is not None - else None - ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) if self.attn_backend not in { @@ -650,7 +625,6 @@ class Qwen2_5_VisionTransformer(nn.Module): act_fn=get_act_and_mul_fn(vision_config.hidden_act), norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", ) for layer_idx in range(depth) @@ -664,7 +638,6 @@ class Qwen2_5_VisionTransformer(nn.Module): norm_layer=norm_layer, spatial_merge_size=self.spatial_merge_size, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.merger", ) @@ -1152,7 +1125,6 @@ class Qwen2_5_VLForConditionalGeneration( norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self.quant_config, prefix=maybe_prefix(prefix, "visual"), - multimodal_config=multimodal_config, ) with self._mark_language_model(vllm_config): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f546f94a2..ef5f6d1e4 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -43,7 +43,7 @@ from 
transformers.models.qwen2_vl.configuration_qwen2_vl import ( from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils @@ -106,6 +106,7 @@ from .utils import ( ) from .vision import ( get_vit_attn_backend, + is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model, ) @@ -247,15 +248,10 @@ class Qwen2VisionMLP(nn.Module): hidden_features: int, act_layer: type[nn.Module] = QuickGELU, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.fc1 = ColumnParallelLinear( in_features, hidden_features, @@ -286,16 +282,11 @@ class Qwen2VisionAttention(nn.Module): num_heads: int, projection_size: int, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() # Per attention head and per partition values. - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.tp_size = ( 1 if use_data_parallel @@ -328,7 +319,6 @@ class Qwen2VisionAttention(nn.Module): num_heads=self.num_attention_heads_per_partition, head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, - multimodal_config=multimodal_config, ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) @@ -409,7 +399,6 @@ class Qwen2VisionBlock(nn.Module): act_layer: type[nn.Module] = QuickGELU, norm_layer: Callable[[int], nn.Module] | None = None, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -424,7 +413,6 @@ class Qwen2VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.mlp = Qwen2VisionMLP( @@ -432,7 +420,6 @@ class Qwen2VisionBlock(nn.Module): mlp_hidden_dim, act_layer=act_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -493,15 +480,10 @@ class Qwen2VisionPatchMerger(nn.Module): norm_layer: Callable[[int], nn.Module] | None = None, spatial_merge_size: int = 2, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.hidden_size = context_dim * (spatial_merge_size**2) if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) @@ -545,7 +527,6 @@ class Qwen2VisionTransformer(nn.Module): vision_config: Qwen2VLVisionConfig, norm_eps: float = 1e-6, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -560,11 +541,7 @@ class Qwen2VisionTransformer(nn.Module): 
num_heads = vision_config.num_heads mlp_ratio = vision_config.mlp_ratio - self.use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + self.use_data_parallel = is_vit_use_data_parallel() self.out_hidden_size = vision_config.hidden_size self.spatial_merge_size = spatial_merge_size @@ -596,7 +573,6 @@ class Qwen2VisionTransformer(nn.Module): norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.blocks.{layer_idx}", - multimodal_config=multimodal_config, ) for layer_idx in range(depth) ] @@ -607,15 +583,10 @@ class Qwen2VisionTransformer(nn.Module): norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.merger", - multimodal_config=multimodal_config, - ) - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend if multimodal_config else None ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) @property @@ -1238,7 +1209,6 @@ class Qwen2VLForConditionalGeneration( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 5ff77c5f5..3b248d1b1 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -48,7 +48,7 @@ from transformers import __version__ as TRANSFORMERS_VERSION # isort: on from vllm.compilation.decorators import support_torch_compile -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY @@ -160,7 +160,6 @@ class Qwen3OmniMoeAudioAttention(nn.Module): def __init__( self, config: Qwen3OmniMoeAudioEncoderConfig, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -198,7 +197,6 @@ class Qwen3OmniMoeAudioAttention(nn.Module): num_heads=self.num_local_heads, head_size=self.head_dim, scale=self.scaling, - multimodal_config=multimodal_config, ) def forward( @@ -233,13 +231,12 @@ class Qwen3OmniMoeAudioEncoderLayer(nn.Module): def __init__( self, config: Qwen3OmniMoeAudioEncoderConfig, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() self.embed_dim = config.d_model self.self_attn = Qwen3OmniMoeAudioAttention( - config, multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn" + config, prefix=f"{prefix}.self_attn" ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.activation_fn = _ACTIVATION_REGISTRY[config.activation_function] @@ -301,7 +298,6 @@ class Qwen3OmniMoeAudioEncoder(nn.Module): def __init__( self, config: Qwen3OmniMoeAudioEncoderConfig, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -345,7 +341,6 @@ class Qwen3OmniMoeAudioEncoder(nn.Module): [ Qwen3OmniMoeAudioEncoderLayer( config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{i}", ) for i in range(config.encoder_layers) @@ -359,15 +354,9 @@ class Qwen3OmniMoeAudioEncoder(nn.Module): self.proj2 = nn.Linear(config.d_model, config.output_dim) # Get attention backend - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend - if 
multimodal_config is not None - else None - ) self.attn_backend = get_vit_attn_backend( head_size=config.d_model // config.encoder_attention_heads, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> torch.Tensor | None: @@ -601,7 +590,6 @@ class Qwen3_VisionBlock(nn.Module): mlp_hidden_dim: int, act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, norm_layer: Callable[[int], nn.Module] | None = None, - multimodal_config: MultiModalConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: @@ -615,7 +603,6 @@ class Qwen3_VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.mlp = Qwen3_VisionMLP( @@ -710,7 +697,6 @@ class Qwen3Omni_VisionTransformer(nn.Module): vision_config, norm_eps: float = 1e-6, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -758,7 +744,6 @@ class Qwen3Omni_VisionTransformer(nn.Module): act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", ) for layer_idx in range(vision_config.depth) @@ -788,16 +773,9 @@ class Qwen3Omni_VisionTransformer(nn.Module): ] ) - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend - if multimodal_config is not None - else None - ) - self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) @property @@ -1617,7 +1595,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( with self._mark_tower_model(vllm_config, "audio"): self.audio_tower = Qwen3OmniMoeAudioEncoder( thinker_config.audio_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "audio_tower"), ) @@ -1638,7 +1615,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), quant_config=quant_config, prefix=maybe_prefix(prefix, "visual"), - multimodal_config=multimodal_config, ) # register buffer for deepstack diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c9eddb6f4..ee9944fdf 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -49,7 +49,7 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import ( from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import MultiModalConfig, VllmConfig +from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group from vllm.logger import init_logger @@ -123,6 +123,7 @@ from .utils import ( ) from .vision import ( get_vit_attn_backend, + is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model, ) @@ -169,15 +170,10 @@ class Qwen3_VisionMLP(nn.Module): bias: bool = False, act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = 
is_vit_use_data_parallel() self.linear_fc1 = ColumnParallelLinear( in_features, hidden_features, @@ -211,7 +207,6 @@ class Qwen3_VisionBlock(nn.Module): mlp_hidden_dim: int, act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, norm_layer: Callable[[int], nn.Module] | None = None, - multimodal_config: MultiModalConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: @@ -225,7 +220,6 @@ class Qwen3_VisionBlock(nn.Module): num_heads=num_heads, projection_size=dim, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.attn", ) self.mlp = Qwen3_VisionMLP( @@ -234,7 +228,6 @@ class Qwen3_VisionBlock(nn.Module): act_fn=act_fn, bias=True, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -267,15 +260,10 @@ class Qwen3_VisionPatchMerger(nn.Module): spatial_merge_size: int = 2, use_postshuffle_norm: bool = False, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.hidden_size = context_dim * (spatial_merge_size**2) self.use_postshuffle_norm = use_postshuffle_norm @@ -321,7 +309,6 @@ class Qwen3_VisionTransformer(nn.Module): vision_config: Qwen3VLVisionConfig, norm_eps: float = 1e-6, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -365,7 +352,6 @@ class Qwen3_VisionTransformer(nn.Module): norm_layer=norm_layer, spatial_merge_size=self.spatial_merge_size, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.merger", ) @@ -378,20 +364,15 @@ class Qwen3_VisionTransformer(nn.Module): use_postshuffle_norm=True, norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", ) for layer_idx in range(len(self.deepstack_visual_indexes)) ] ) - attn_backend_override = ( - multimodal_config.mm_encoder_attn_backend if multimodal_config else None - ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, ) if self.attn_backend not in { @@ -411,7 +392,6 @@ class Qwen3_VisionTransformer(nn.Module): act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], norm_layer=norm_layer, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", ) for layer_idx in range(vision_config.depth) @@ -1291,7 +1271,6 @@ class Qwen3VLForConditionalGeneration( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 50b511dd2..efd965ad4 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -446,7 +446,6 @@ class Qwen3VLMoeForConditionalGeneration( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "visual"), ) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 0f7815936..9d4e76f1c 100644 
--- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -16,7 +16,7 @@ from transformers import ( ) from vllm.config import VllmConfig -from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention.encoder_only_attention import ( @@ -64,6 +64,7 @@ from .vision import ( VisionFeatureSelectStrategy, VisionFeatureSelectStrategyStr, get_num_selected_vision_tokens, + is_vit_use_data_parallel, resolve_visual_encoder_outputs, ) @@ -356,7 +357,6 @@ class SiglipAttention(nn.Module): self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], @@ -376,11 +376,7 @@ class SiglipAttention(nn.Module): self.scale = self.head_dim**-0.5 - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, head_size=self.head_dim, @@ -409,7 +405,6 @@ class SiglipAttention(nn.Module): self.head_dim, self.scale, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) else: self.attn = attn_cls( @@ -437,17 +432,12 @@ class SiglipMLP(nn.Module): self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() self.config = config - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.activation_fn = get_act_fn(config.hidden_act) # Special handling for BNB and torchao quantization @@ -487,7 +477,6 @@ class SiglipEncoderLayer(nn.Module): self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, prefix: str = "", attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], @@ -499,7 +488,6 @@ class SiglipEncoderLayer(nn.Module): self.self_attn = SiglipAttention( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", attn_cls=attn_cls, ) @@ -507,7 +495,6 @@ class SiglipEncoderLayer(nn.Module): self.mlp = SiglipMLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -535,7 +522,6 @@ class SiglipEncoder(nn.Module): self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, num_hidden_layers_override: int | None = None, *, prefix: str = "", @@ -555,7 +541,6 @@ class SiglipEncoder(nn.Module): SiglipEncoderLayer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{layer_idx}", attn_cls=attn_cls, ) @@ -660,7 +645,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module): self, config: SiglipVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -674,7 
+658,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module): self.mlp = SiglipMLP( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -700,7 +683,6 @@ class SiglipVisionTransformer(nn.Module): self, config: SiglipVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, @@ -717,7 +699,6 @@ class SiglipVisionTransformer(nn.Module): self.encoder = SiglipEncoder( config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.encoder", attn_cls=MMEncoderAttention, @@ -756,7 +737,6 @@ class SiglipVisionTransformer(nn.Module): SiglipMultiheadAttentionPoolingHead( config=config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.head", ) if self.use_head @@ -870,7 +850,6 @@ class SiglipVisionModel(nn.Module): self, config: SiglipVisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, *, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, @@ -883,7 +862,6 @@ class SiglipVisionModel(nn.Module): self.vision_model = SiglipVisionTransformer( config, quant_config=quant_config, - multimodal_config=multimodal_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, prefix=f"{prefix}.vision_model", @@ -1062,9 +1040,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): config: SiglipConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config self.config = config - self.multimodal_config = multimodal_config if hasattr(config, "num_labels"): config.num_labels = 0 @@ -1087,7 +1063,6 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): self.vision_model = SiglipVisionTransformer( vision_config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_model"), use_head=None, # Allows potential pooling head ) diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index f4b79da5c..0b81d1b00 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -11,7 +11,6 @@ from torch.nn import functional as F from transformers import Siglip2VisionConfig from transformers.configuration_utils import PretrainedConfig -from vllm.config import MultiModalConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention @@ -30,6 +29,8 @@ from vllm.model_executor.layers.rotary_embedding.common import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.platforms import current_platform +from .vision import is_vit_use_data_parallel + class VisionRotaryEmbedding(nn.Module): def __init__(self, dim: int, theta: float = 10000.0) -> None: @@ -178,9 +179,7 @@ class Siglip2Attention(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", - use_data_parallel: bool = False, ): super().__init__() self.config 
= config @@ -196,11 +195,7 @@ class Siglip2Attention(nn.Module): self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, head_size=self.head_dim, @@ -228,7 +223,6 @@ class Siglip2Attention(nn.Module): head_size=self.head_dim, scale=self.scale, prefix=f"{prefix}.attn", - multimodal_config=multimodal_config, ) self.apply_rotary_emb = ApplyRotaryEmb( @@ -287,16 +281,11 @@ class Siglip2MLP(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() self.config = config - use_data_parallel = ( - multimodal_config.mm_encoder_tp_mode == "data" - if multimodal_config - else False - ) + use_data_parallel = is_vit_use_data_parallel() self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear( config.hidden_size, @@ -325,7 +314,6 @@ class Siglip2EncoderLayer(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -334,14 +322,12 @@ class Siglip2EncoderLayer(nn.Module): self.self_attn = Siglip2Attention( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = Siglip2MLP( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.mlp", ) @@ -387,7 +373,6 @@ class Siglip2Encoder(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -397,7 +382,6 @@ class Siglip2Encoder(nn.Module): Siglip2EncoderLayer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.layers.{idx}", ) for idx in range(config.num_hidden_layers) @@ -571,7 +555,6 @@ class Siglip2VisionTransformer(nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -582,7 +565,6 @@ class Siglip2VisionTransformer(nn.Module): self.encoder = Siglip2Encoder( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.encoder", ) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -610,7 +592,6 @@ class Siglip2NavitModel(torch.nn.Module): self, config: Siglip2VisionConfig, quant_config: QuantizationConfig | None = None, - multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() @@ -618,7 +599,6 @@ class Siglip2NavitModel(torch.nn.Module): self.vision_model = Siglip2VisionTransformer( config, quant_config=quant_config, - multimodal_config=multimodal_config, prefix=f"{prefix}.vision_model", ) diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index efa717fef..cc7ccc65b 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -19,7 +19,7 @@ from transformers.models.llava import LlavaProcessor from transformers.processing_utils import ProcessingKwargs, Unpack from transformers.tokenization_utils_base 
 import PreTokenizedInput, TextInput
 
-from vllm.config import MultiModalConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -351,7 +351,6 @@ def _build_tarsier_hf_processor(
 def init_vision_tower_for_tarsier(
     hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
     quant_config: QuantizationConfig | None,
-    multimodal_config: MultiModalConfig | None,
     *,
     require_post_norm: bool | None = None,
     prefix: str = "",
@@ -378,7 +377,6 @@ def init_vision_tower_for_tarsier(
         return CLIPVisionModel(
             vision_config,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_to_init,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -387,7 +385,6 @@ def init_vision_tower_for_tarsier(
         return SiglipVisionModel(
             vision_config,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             num_hidden_layers_override=num_hidden_layers_to_init,
             require_post_norm=require_post_norm,
             prefix=prefix,
@@ -420,7 +417,6 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
         config: TarsierHfConfig = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        multimodal_config = vllm_config.model_config.multimodal_config
 
         self.config = config  # Storing the Tarsier-specific HF config
@@ -428,7 +424,6 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
         self.vision_tower = init_vision_tower_for_tarsier(
             config,
             quant_config=quant_config,
-            multimodal_config=multimodal_config,
             require_post_norm=False,
             prefix=maybe_prefix(prefix, "vision_tower"),
         )
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index f516a3d47..8be6aedcc 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -10,7 +10,7 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
 import torch
 from transformers import PretrainedConfig
 
-from vllm.config import VllmConfig
+from vllm.config import MultiModalConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -79,7 +79,7 @@ def get_vision_encoder_info(hf_config: VisionLanguageConfig) -> VisionEncoderInf
     raise NotImplementedError(msg)
 
 
-def get_vit_attn_backend(
+def _get_vit_attn_backend(
     head_size: int,
     dtype: torch.dtype,
     *,
@@ -95,6 +95,52 @@ def get_vit_attn_backend(
     )
 
+
+def get_vit_attn_backend(
+    head_size: int,
+    dtype: torch.dtype,
+) -> AttentionBackendEnum:
+    """
+    Get the attention backend for Vision Transformer.
+    """
+    try:
+        vllm_config: VllmConfig = get_current_vllm_config()
+        multimodal_config: MultiModalConfig | None = (
+            vllm_config.model_config.multimodal_config
+        )
+    except AssertionError:
+        multimodal_config = None
+
+    attn_backend_override = (
+        multimodal_config.mm_encoder_attn_backend
+        if multimodal_config is not None
+        else None
+    )
+    attn_backend = _get_vit_attn_backend(
+        head_size,
+        dtype,
+        attn_backend_override=attn_backend_override,
+    )
+    return attn_backend
+
+
+def is_vit_use_data_parallel() -> bool:
+    """
+    Return whether the multi-modal (ViT) encoder should use data parallelism.
+    """
+    try:
+        vllm_config: VllmConfig = get_current_vllm_config()
+        multimodal_config: MultiModalConfig | None = (
+            vllm_config.model_config.multimodal_config
+        )
+    except AssertionError:
+        multimodal_config = None
+
+    mm_encoder_tp_mode = (
+        multimodal_config.mm_encoder_tp_mode if multimodal_config is not None else None
+    )
+    return mm_encoder_tp_mode == "data"
+
+
 def should_torch_compile_mm_vit(vllm_config: VllmConfig) -> bool:
     """Callable to be passed to `@support_torch_compile`'s `enable_if` argument."""
     return vllm_config.compilation_config.compile_mm_encoder