[Models]: Make Multimodal config implicit in ViT implementation (#31972)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-01-24 20:34:26 +08:00
committed by GitHub
parent 6450b536a6
commit 9ad7f89f55
38 changed files with 118 additions and 470 deletions

View File

@@ -18,7 +18,6 @@ import torch.nn as nn
import torch.nn.functional as F
from transformers import CLIPVisionConfig
from vllm.config import MultiModalConfig
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -609,7 +608,6 @@ class DeepCLIPVisionTransformer(nn.Module):
self,
config: CLIPVisionConfig,
quant_config: QuantizationConfig | None = None,
multimodal_config: MultiModalConfig | None = None,
*,
num_hidden_layers_override: int | None = None,
prefix: str = "",
@@ -628,7 +626,6 @@ class DeepCLIPVisionTransformer(nn.Module):
self.transformer = CLIPEncoder(
config=config,
quant_config=quant_config,
multimodal_config=multimodal_config,
num_hidden_layers_override=num_hidden_layers_override,
prefix=f"{prefix}.encoder",
attn_cls=MMEncoderAttention,