[Models]: Make Multimodal config implicit in ViT implementation (#31972)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -16,7 +16,7 @@ from transformers.image_processing_utils import BatchFeature
|
||||
from transformers.tokenization_utils import TensorType
|
||||
from typing_extensions import TypedDict, Unpack
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -72,6 +72,8 @@ from vllm.transformers_utils.configs import (
|
||||
)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .vision import is_vit_use_data_parallel
|
||||
|
||||
|
||||
def create_cumulative_seq_lengths(
|
||||
seq_sizes: torch.Tensor, device: torch.device
|
||||
@@ -942,15 +944,10 @@ class Siglip2VisionAttention(nn.Module):
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.tp_size = (
|
||||
1
|
||||
if use_data_parallel
|
||||
@@ -987,7 +984,6 @@ class Siglip2VisionAttention(nn.Module):
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
|
||||
@@ -1038,7 +1034,6 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@@ -1047,7 +1042,6 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = SiglipMLP(
|
||||
@@ -1088,7 +1082,6 @@ class Siglip2Encoder(nn.Module):
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@@ -1098,7 +1091,6 @@ class Siglip2Encoder(nn.Module):
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
for layer_idx in range(config.num_hidden_layers)
|
||||
]
|
||||
@@ -1127,7 +1119,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
config: PixelShuffleSiglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@@ -1140,7 +1131,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.encoder",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
@@ -1221,14 +1211,12 @@ class IsaacVisionEmbedding(nn.Module):
|
||||
hidden_dim: int,
|
||||
output_dim: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.transformer = Siglip2VisionTransformer(
|
||||
vision_cfg,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "0"),
|
||||
)
|
||||
self.linear_fc1 = ColumnParallelLinear(
|
||||
@@ -1309,7 +1297,6 @@ class IsaacForConditionalGeneration(
|
||||
config: IsaacConfig = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
self.config = config
|
||||
self.multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
head_dim = config.head_dim
|
||||
calculated_mrope_section = [
|
||||
@@ -1373,7 +1360,6 @@ class IsaacForConditionalGeneration(
|
||||
hidden_dim=hidden_dim,
|
||||
output_dim=config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=self.multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_embedding"),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user