[Models]: Make Multimodal config implicit in ViT implementation (#31972)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-01-24 20:34:26 +08:00
committed by GitHub
parent 6450b536a6
commit 9ad7f89f55
38 changed files with 118 additions and 470 deletions

View File

@@ -30,7 +30,7 @@ from transformers.modeling_outputs import (
)
from transformers.utils import torch_int
from vllm.config import MultiModalConfig, VllmConfig
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils
@@ -532,7 +532,6 @@ class SiglipAttention(nn.Module):
num_heads: int,
projection_size: int,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
) -> None:
super().__init__()
@@ -565,7 +564,6 @@ class SiglipAttention(nn.Module):
num_heads=self.num_attention_heads_per_partition,
head_size=self.hidden_size_per_attention_head,
scale=self.hidden_size_per_attention_head**-0.5,
multimodal_config=multimodal_config,
prefix=f"{prefix}.attn",
)
self.apply_rotary_emb = ApplyRotaryEmb(
@@ -662,7 +660,6 @@ class SiglipEncoderLayer(nn.Module):
self,
config: PretrainedConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
):
super().__init__()
@@ -673,14 +670,12 @@ class SiglipEncoderLayer(nn.Module):
num_heads=config.num_attention_heads,
projection_size=config.hidden_size,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.self_attn",
)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.mlp",
)
@@ -718,7 +713,6 @@ class SiglipEncoder(nn.Module):
self,
config: PretrainedConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
):
super().__init__()
@@ -727,13 +721,9 @@ class SiglipEncoder(nn.Module):
num_heads = config.num_attention_heads
head_dim = embed_dim // num_heads
attn_backend_override = (
multimodal_config.mm_encoder_attn_backend if multimodal_config else None
)
self.attn_backend = get_vit_attn_backend(
head_size=head_dim,
dtype=torch.get_default_dtype(),
attn_backend_override=attn_backend_override,
)
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
@@ -748,7 +738,6 @@ class SiglipEncoder(nn.Module):
SiglipEncoderLayer(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.layers.{layer_idx}",
)
for layer_idx in range(config.num_hidden_layers)
@@ -830,7 +819,6 @@ class SiglipVisionTransformer(nn.Module):
self,
config: PretrainedConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
):
super().__init__()
@@ -841,7 +829,6 @@ class SiglipVisionTransformer(nn.Module):
self.encoder = SiglipEncoder(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.encoder",
)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@@ -880,7 +867,6 @@ class SiglipVisionModel(nn.Module):
self,
config,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
prefix: str = "",
):
super().__init__()
@@ -888,7 +874,6 @@ class SiglipVisionModel(nn.Module):
self.vision_model = SiglipVisionTransformer(
config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=f"{prefix}.vision_model",
)
self.quant_config = quant_config
@@ -1010,16 +995,13 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
multimodal_config = vllm_config.model_config.multimodal_config
self.config = config
self.multimodal_config = multimodal_config
with self._mark_tower_model(vllm_config, "image"):
self.visual = SiglipVisionModel(
config=config.vision_config,
quant_config=quant_config,
multimodal_config=multimodal_config,
prefix=maybe_prefix(prefix, "visual"),
)
self.mlp_AR = Projector(config, config.vision_config)