[Models]: Make Multimodal config implicit in ViT implementation (#31972)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -4,7 +4,6 @@
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.models.vision import get_vit_attn_backend
|
||||
@@ -32,7 +31,6 @@ class MMEncoderAttention(CustomOp):
|
||||
scale: float | None = None,
|
||||
num_kv_heads: int | None = None,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Args:
|
||||
@@ -42,7 +40,6 @@ class MMEncoderAttention(CustomOp):
|
||||
num_kv_heads: number of kv heads.
|
||||
prefix: This has no effect, it is only here to make it easier to
|
||||
swap between Attention and MultiHeadAttention
|
||||
multimodal_config: configs for multi-modal.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
@@ -62,16 +59,10 @@ class MMEncoderAttention(CustomOp):
|
||||
# weight and activation dtype.
|
||||
dtype = torch.get_default_dtype()
|
||||
|
||||
# Try to get vision attention backend from multimodal_config.
|
||||
attn_backend_override = None
|
||||
if multimodal_config is not None:
|
||||
attn_backend_override = multimodal_config.mm_encoder_attn_backend
|
||||
|
||||
# Get device-specific vision attention backend.
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_size,
|
||||
dtype=dtype,
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
self.is_flash_attn_backend = self.attn_backend in {
|
||||
|
||||
@@ -16,7 +16,7 @@ from transformers import (
|
||||
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import divide, get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
|
||||
@@ -59,6 +59,7 @@ from .vision import (
|
||||
VisionFeatureSelectStrategy,
|
||||
VisionFeatureSelectStrategyStr,
|
||||
get_num_selected_vision_tokens,
|
||||
is_vit_use_data_parallel,
|
||||
resolve_visual_encoder_outputs,
|
||||
)
|
||||
|
||||
@@ -353,7 +354,6 @@ class CLIPAttention(nn.Module):
|
||||
self,
|
||||
config: CLIPTextConfig | CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
attn_cls: type[Attention] | type[MMEncoderAttention],
|
||||
@@ -372,11 +372,7 @@ class CLIPAttention(nn.Module):
|
||||
)
|
||||
self.scale = self.head_dim**-0.5
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
hidden_size=self.embed_dim,
|
||||
head_size=self.head_dim,
|
||||
@@ -405,7 +401,6 @@ class CLIPAttention(nn.Module):
|
||||
self.head_dim,
|
||||
self.scale,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
else:
|
||||
self.attn = attn_cls(
|
||||
@@ -434,17 +429,12 @@ class CLIPMLP(nn.Module):
|
||||
self,
|
||||
config: CLIPTextConfig | CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.activation_fn = get_act_fn(config.hidden_act)
|
||||
|
||||
self.fc1 = ColumnParallelLinear(
|
||||
@@ -477,7 +467,6 @@ class CLIPEncoderLayer(nn.Module):
|
||||
self,
|
||||
config: CLIPTextConfig | CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
attn_cls: type[Attention] | type[MMEncoderAttention],
|
||||
@@ -487,7 +476,6 @@ class CLIPEncoderLayer(nn.Module):
|
||||
self.self_attn = CLIPAttention(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
attn_cls=attn_cls,
|
||||
)
|
||||
@@ -495,7 +483,6 @@ class CLIPEncoderLayer(nn.Module):
|
||||
self.mlp = CLIPMLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
@@ -528,7 +515,6 @@ class CLIPEncoder(nn.Module):
|
||||
self,
|
||||
config: CLIPTextConfig | CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
@@ -548,7 +534,6 @@ class CLIPEncoder(nn.Module):
|
||||
CLIPEncoderLayer(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
attn_cls=attn_cls,
|
||||
)
|
||||
@@ -658,7 +643,6 @@ class CLIPVisionTransformer(nn.Module):
|
||||
self,
|
||||
config: CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -678,7 +662,6 @@ class CLIPVisionTransformer(nn.Module):
|
||||
self.encoder = CLIPEncoder(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_override,
|
||||
prefix=f"{prefix}.encoder",
|
||||
attn_cls=MMEncoderAttention,
|
||||
@@ -780,7 +763,6 @@ class CLIPVisionModel(nn.Module):
|
||||
self,
|
||||
config: CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -791,7 +773,6 @@ class CLIPVisionModel(nn.Module):
|
||||
self.vision_model = CLIPVisionTransformer(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_override,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=f"{prefix}.vision_model",
|
||||
@@ -869,7 +850,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
|
||||
self.vision_model = CLIPVisionTransformer(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_model"),
|
||||
)
|
||||
self.visual_projection = nn.Linear(
|
||||
|
||||
@@ -18,7 +18,6 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import CLIPVisionConfig
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
|
||||
from vllm.model_executor.layers.conv import Conv2dLayer
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@@ -609,7 +608,6 @@ class DeepCLIPVisionTransformer(nn.Module):
|
||||
self,
|
||||
config: CLIPVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
prefix: str = "",
|
||||
@@ -628,7 +626,6 @@ class DeepCLIPVisionTransformer(nn.Module):
|
||||
self.transformer = CLIPEncoder(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_override,
|
||||
prefix=f"{prefix}.encoder",
|
||||
attn_cls=MMEncoderAttention,
|
||||
|
||||
@@ -398,7 +398,6 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports
|
||||
self.vision_model = DeepCLIPVisionTransformer(
|
||||
config=clip_vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_model"),
|
||||
)
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import torch.nn as nn
|
||||
from torch.nn import LayerNorm
|
||||
from transformers.models.qwen2_vl import Qwen2VLProcessor
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import utils as dist_utils
|
||||
from vllm.distributed.parallel_state import (
|
||||
@@ -60,7 +60,7 @@ from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionCon
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
from .vision import run_dp_sharded_mrope_vision_model
|
||||
from .vision import is_vit_use_data_parallel, run_dp_sharded_mrope_vision_model
|
||||
|
||||
IMAGE_TOKEN = "<|imgpad|>"
|
||||
|
||||
@@ -183,9 +183,9 @@ class PatchMerger(nn.Module):
|
||||
spatial_merge_size: int = 2,
|
||||
pre_norm="layernorm",
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.hidden_size = context_dim * (spatial_merge_size**2)
|
||||
self.pre_norm = pre_norm
|
||||
if self.pre_norm == "layernorm":
|
||||
@@ -230,15 +230,10 @@ class DotsVisionAttention(nn.Module):
|
||||
bias: bool = True,
|
||||
*,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
|
||||
self.embed_dim = dim
|
||||
self.tp_size = (
|
||||
@@ -272,7 +267,6 @@ class DotsVisionAttention(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
|
||||
@@ -329,7 +323,6 @@ class DotsSwiGLUFFN(nn.Module):
|
||||
config,
|
||||
*,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -337,11 +330,7 @@ class DotsSwiGLUFFN(nn.Module):
|
||||
in_features = config.embed_dim
|
||||
bias = config.use_bias
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
# Referenced aimv2.py AIMv2SwiGLUFFN
|
||||
self.fc13 = MergedColumnParallelLinear(
|
||||
in_features,
|
||||
@@ -447,7 +436,6 @@ class DotsVisionBlock(nn.Module):
|
||||
config,
|
||||
*,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -458,14 +446,12 @@ class DotsVisionBlock(nn.Module):
|
||||
num_heads=config.num_attention_heads,
|
||||
bias=config.use_bias,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
|
||||
self.mlp = DotsSwiGLUFFN(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
|
||||
@@ -493,7 +479,6 @@ class DotsVisionTransformer(nn.Module):
|
||||
self,
|
||||
config: DotsVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -507,15 +492,9 @@ class DotsVisionTransformer(nn.Module):
|
||||
|
||||
head_dim = config.embed_dim // config.num_attention_heads
|
||||
self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend
|
||||
if multimodal_config is not None
|
||||
else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
self.out_hidden_size = config.hidden_size
|
||||
# Keep blocks for compatibility with other vision towers
|
||||
@@ -529,7 +508,6 @@ class DotsVisionTransformer(nn.Module):
|
||||
DotsVisionBlock(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{i}",
|
||||
)
|
||||
for i in range(num_layers)
|
||||
@@ -542,16 +520,10 @@ class DotsVisionTransformer(nn.Module):
|
||||
else:
|
||||
self.post_trunk_norm = None
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
self.merger = PatchMerger(
|
||||
dim=config.hidden_size,
|
||||
context_dim=config.embed_dim,
|
||||
spatial_merge_size=config.spatial_merge_size,
|
||||
use_data_parallel=use_data_parallel,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -693,7 +665,6 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
|
||||
self.vision_tower = DotsVisionTransformer(
|
||||
vision_config,
|
||||
quant_config=self.quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
|
||||
@@ -270,7 +270,6 @@ class Eagle2_5_VLForConditionalGeneration(
|
||||
return SiglipVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=self.multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
prefix=prefix,
|
||||
)
|
||||
|
||||
@@ -36,7 +36,7 @@ import torch.nn.functional as F
|
||||
from einops import rearrange
|
||||
from transformers import BatchFeature
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -119,7 +119,6 @@ class Ernie4_5_VisionAttention(nn.Module):
|
||||
num_heads: int,
|
||||
projection_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -153,7 +152,6 @@ class Ernie4_5_VisionAttention(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
|
||||
@@ -266,7 +264,6 @@ class Ernie4_5_VisionBlock(nn.Module):
|
||||
act_layer: type[nn.Module] = QuickGELU,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -282,7 +279,6 @@ class Ernie4_5_VisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
|
||||
@@ -357,7 +353,6 @@ class Ernie4_5_VisionTransformer(nn.Module):
|
||||
vision_config,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -393,7 +388,6 @@ class Ernie4_5_VisionTransformer(nn.Module):
|
||||
mlp_ratio=mlp_ratio,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(depth)
|
||||
@@ -405,13 +399,9 @@ class Ernie4_5_VisionTransformer(nn.Module):
|
||||
)
|
||||
self.ln = nn.LayerNorm(hidden_size, eps=1e-6)
|
||||
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend if multimodal_config else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -1308,7 +1298,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
|
||||
config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_model"),
|
||||
)
|
||||
self.resampler_model = VariableResolutionResamplerModel(
|
||||
|
||||
@@ -46,7 +46,7 @@ from transformers.models.glm4v.image_processing_glm4v import (
|
||||
from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -107,6 +107,7 @@ from .utils import (
|
||||
)
|
||||
from .vision import (
|
||||
get_vit_attn_backend,
|
||||
is_vit_use_data_parallel,
|
||||
run_dp_sharded_mrope_vision_model,
|
||||
)
|
||||
|
||||
@@ -196,15 +197,10 @@ class Glm4vVisionMLP(nn.Module):
|
||||
hidden_features: int,
|
||||
bias: bool = False,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.gate_up_proj = MergedColumnParallelLinear(
|
||||
input_size=in_features,
|
||||
output_sizes=[hidden_features] * 2,
|
||||
@@ -258,16 +254,11 @@ class Glm4vVisionAttention(nn.Module):
|
||||
num_heads: int,
|
||||
projection_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
# Per attention head and per partition values.
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.tp_size = (
|
||||
1 if use_data_parallel else get_tensor_model_parallel_world_size()
|
||||
)
|
||||
@@ -305,7 +296,6 @@ class Glm4vVisionAttention(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)
|
||||
@@ -373,7 +363,6 @@ class Glm4vVisionBlock(nn.Module):
|
||||
mlp_hidden_dim: int,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -386,7 +375,6 @@ class Glm4vVisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.mlp = Glm4vVisionMLP(
|
||||
@@ -394,7 +382,6 @@ class Glm4vVisionBlock(nn.Module):
|
||||
mlp_hidden_dim,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -454,16 +441,11 @@ class Glm4vPatchMerger(nn.Module):
|
||||
d_model: int,
|
||||
context_dim: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
bias: bool = False,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.hidden_size = d_model
|
||||
self.proj = ColumnParallelLinear(
|
||||
self.hidden_size,
|
||||
@@ -619,13 +601,10 @@ class Glm4vVisionTransformer(nn.Module):
|
||||
vision_config: Glm4vVisionConfig,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
assert multimodal_config is not None, "multimodal_config must be provided"
|
||||
|
||||
patch_size = vision_config.patch_size
|
||||
temporal_patch_size = vision_config.temporal_patch_size
|
||||
in_channels = vision_config.in_channels
|
||||
@@ -660,7 +639,6 @@ class Glm4vVisionTransformer(nn.Module):
|
||||
mlp_hidden_dim=vision_config.out_hidden_size,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(depth)
|
||||
@@ -670,7 +648,6 @@ class Glm4vVisionTransformer(nn.Module):
|
||||
d_model=vision_config.out_hidden_size,
|
||||
context_dim=vision_config.intermediate_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
bias=False,
|
||||
prefix=f"{prefix}.merger",
|
||||
)
|
||||
@@ -692,7 +669,6 @@ class Glm4vVisionTransformer(nn.Module):
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=multimodal_config.mm_encoder_attn_backend,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -1439,7 +1415,6 @@ class Glm4vForConditionalGeneration(
|
||||
config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-5),
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import BatchFeature
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -80,7 +80,6 @@ from vllm.transformers_utils.configs.hunyuan_vl import (
|
||||
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
|
||||
from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
from .interfaces import (
|
||||
MultiModalEmbeddings,
|
||||
@@ -96,6 +95,7 @@ from .utils import (
|
||||
init_vllm_registered_model,
|
||||
maybe_prefix,
|
||||
)
|
||||
from .vision import is_vit_use_data_parallel
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -160,9 +160,9 @@ class HunYuanVisionMLP(nn.Module):
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.gelu,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.dense_h_to_4h = ColumnParallelLinear(
|
||||
in_features,
|
||||
hidden_features,
|
||||
@@ -194,12 +194,11 @@ class HunYuanVisionAttention(nn.Module):
|
||||
num_heads: int,
|
||||
projection_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
# Per attention head and per partition values.
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.tp_size = (
|
||||
1
|
||||
if use_data_parallel
|
||||
@@ -237,7 +236,6 @@ class HunYuanVisionAttention(nn.Module):
|
||||
self.hidden_size_per_attention_head,
|
||||
self.scale,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
def forward(
|
||||
@@ -260,9 +258,7 @@ class HunYuanVisionBlock(nn.Module):
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.gelu,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
if norm_layer is None:
|
||||
@@ -274,9 +270,7 @@ class HunYuanVisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
use_data_parallel=use_data_parallel,
|
||||
)
|
||||
self.mlp = HunYuanVisionMLP(
|
||||
dim,
|
||||
@@ -285,7 +279,6 @@ class HunYuanVisionBlock(nn.Module):
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
use_data_parallel=use_data_parallel,
|
||||
)
|
||||
|
||||
def forward(
|
||||
@@ -439,9 +432,6 @@ class HunYuanVisionTransformer(nn.Module):
|
||||
vision_config: HunYuanVLVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
attn_backend_override: AttentionBackendEnum | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
@@ -467,9 +457,7 @@ class HunYuanVisionTransformer(nn.Module):
|
||||
act_fn=get_act_fn(vision_config.hidden_act),
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
use_data_parallel=use_data_parallel,
|
||||
)
|
||||
for layer_idx in range(num_hidden_layers)
|
||||
]
|
||||
@@ -872,23 +860,14 @@ class HunYuanVLForConditionalGeneration(
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
config: HunYuanVLConfig = vllm_config.model_config.hf_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
self.config = config
|
||||
self.multimodal_config = multimodal_config
|
||||
|
||||
with self._mark_tower_model(vllm_config, {"image"}):
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend
|
||||
if multimodal_config is not None
|
||||
else None
|
||||
)
|
||||
self.visual = HunYuanVisionTransformer(
|
||||
config.vision_config,
|
||||
quant_config=self.quant_config,
|
||||
quant_config=vllm_config.quant_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
multimodal_config=multimodal_config,
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
with self._mark_language_model(vllm_config):
|
||||
|
||||
@@ -17,7 +17,7 @@ from timm.models.regnet import RegStage
|
||||
from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||
@@ -360,7 +360,6 @@ def _build_hcxvision_hf_processor(
|
||||
def init_vision_tower_for_hcxvision(
|
||||
vision_config,
|
||||
quant_config: QuantizationConfig | None,
|
||||
multimodal_config: MultiModalConfig | None,
|
||||
*,
|
||||
use_nth_layer: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -378,7 +377,6 @@ def init_vision_tower_for_hcxvision(
|
||||
return CLIPVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -387,7 +385,6 @@ def init_vision_tower_for_hcxvision(
|
||||
return SiglipVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -605,7 +602,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
# init configs
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
# text_config
|
||||
text_config = config.text_config
|
||||
if text_config.model_type in ["gpt2", "hyperclovax", "llama"]:
|
||||
@@ -628,7 +624,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self.vision_model = init_vision_tower_for_hcxvision(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
use_nth_layer=getattr(config, "use_nth_layer", -1),
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_model"),
|
||||
|
||||
@@ -16,7 +16,7 @@ from transformers.image_processing_utils import BatchFeature
|
||||
from transformers.tokenization_utils import TensorType
|
||||
from typing_extensions import TypedDict, Unpack
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -72,6 +72,8 @@ from vllm.transformers_utils.configs import (
|
||||
)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .vision import is_vit_use_data_parallel
|
||||
|
||||
|
||||
def create_cumulative_seq_lengths(
|
||||
seq_sizes: torch.Tensor, device: torch.device
|
||||
@@ -942,15 +944,10 @@ class Siglip2VisionAttention(nn.Module):
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.tp_size = (
|
||||
1
|
||||
if use_data_parallel
|
||||
@@ -987,7 +984,6 @@ class Siglip2VisionAttention(nn.Module):
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
|
||||
@@ -1038,7 +1034,6 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@@ -1047,7 +1042,6 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = SiglipMLP(
|
||||
@@ -1088,7 +1082,6 @@ class Siglip2Encoder(nn.Module):
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@@ -1098,7 +1091,6 @@ class Siglip2Encoder(nn.Module):
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
for layer_idx in range(config.num_hidden_layers)
|
||||
]
|
||||
@@ -1127,7 +1119,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
config: PixelShuffleSiglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@@ -1140,7 +1131,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.encoder",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
@@ -1221,14 +1211,12 @@ class IsaacVisionEmbedding(nn.Module):
|
||||
hidden_dim: int,
|
||||
output_dim: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.transformer = Siglip2VisionTransformer(
|
||||
vision_cfg,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "0"),
|
||||
)
|
||||
self.linear_fc1 = ColumnParallelLinear(
|
||||
@@ -1309,7 +1297,6 @@ class IsaacForConditionalGeneration(
|
||||
config: IsaacConfig = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
self.config = config
|
||||
self.multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
head_dim = config.head_dim
|
||||
calculated_mrope_section = [
|
||||
@@ -1373,7 +1360,6 @@ class IsaacForConditionalGeneration(
|
||||
hidden_dim=hidden_dim,
|
||||
output_dim=config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=self.multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_embedding"),
|
||||
)
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ from transformers.feature_extraction_utils import BatchFeature
|
||||
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
|
||||
from transformers.utils import torch_int
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
from vllm.logger import init_logger
|
||||
@@ -80,6 +80,7 @@ from .utils import (
|
||||
is_pp_missing_parameter,
|
||||
maybe_prefix,
|
||||
)
|
||||
from .vision import is_vit_use_data_parallel
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -358,7 +359,6 @@ class KeyeSiglipAttention(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -366,7 +366,8 @@ class KeyeSiglipAttention(nn.Module):
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
self.hidden_size = config.hidden_size
|
||||
tp_size = get_tensor_model_parallel_world_size()
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
|
||||
self.total_num_heads = config.num_attention_heads
|
||||
assert self.total_num_heads % tp_size == 0
|
||||
self.num_heads = self.total_num_heads // tp_size
|
||||
@@ -403,7 +404,6 @@ class KeyeSiglipAttention(nn.Module):
|
||||
scale=self.scale,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(
|
||||
@@ -497,7 +497,6 @@ class KeyeSiglipEncoderLayer(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -506,14 +505,12 @@ class KeyeSiglipEncoderLayer(nn.Module):
|
||||
self.self_attn = KeyeSiglipAttention(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = SiglipMLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -552,7 +549,6 @@ class KeyeSiglipEncoder(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -565,7 +561,6 @@ class KeyeSiglipEncoder(nn.Module):
|
||||
KeyeSiglipEncoderLayer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(config.num_hidden_layers)
|
||||
@@ -647,7 +642,6 @@ class KeyeSiglipVisionTransformer(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -658,7 +652,6 @@ class KeyeSiglipVisionTransformer(nn.Module):
|
||||
self.encoder = KeyeSiglipEncoder(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.encoder",
|
||||
)
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
@@ -730,7 +723,6 @@ class KeyeSiglipVisionModel(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -738,7 +730,6 @@ class KeyeSiglipVisionModel(nn.Module):
|
||||
self.vision_model = KeyeSiglipVisionTransformer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.vision_model",
|
||||
)
|
||||
self.quant_config = quant_config
|
||||
@@ -1275,16 +1266,13 @@ class BaseKeyeModule(nn.Module, SupportsMultiModal):
|
||||
super().__init__()
|
||||
config: PretrainedConfig = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
self.config = config
|
||||
self.multimodal_config = multimodal_config
|
||||
|
||||
with self._mark_tower_model(vllm_config, {"image", "video"}):
|
||||
self.visual = KeyeSiglipVisionModel(
|
||||
config.vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
self.mlp_AR = self._build_projector(
|
||||
|
||||
@@ -317,7 +317,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
with self._mark_tower_model(vllm_config, "image"):
|
||||
self.vision_tower = MoonVitPretrainedModel(
|
||||
config.vision_config,
|
||||
multimodal_config=model_config.multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
self.multi_modal_projector = KimiVLMultiModalProjector(
|
||||
|
||||
@@ -11,7 +11,6 @@ from torch.nn import functional as F
|
||||
from transformers import Siglip2VisionConfig
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
|
||||
@@ -23,7 +22,7 @@ from vllm.model_executor.layers.linear import (
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
from .vision import should_torch_compile_mm_vit
|
||||
from .vision import is_vit_use_data_parallel, should_torch_compile_mm_vit
|
||||
|
||||
|
||||
class Siglip2VisionEmbeddings(nn.Module):
|
||||
@@ -154,7 +153,6 @@ class Siglip2Attention(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -171,10 +169,7 @@ class Siglip2Attention(nn.Module):
|
||||
self.scale = self.head_dim**-0.5
|
||||
self.dropout = config.attention_dropout
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config is not None
|
||||
and multimodal_config.mm_encoder_tp_mode == "data"
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
|
||||
assert self.num_heads % tp_size == 0
|
||||
self.num_heads_per_partition = self.num_heads // tp_size
|
||||
@@ -199,7 +194,6 @@ class Siglip2Attention(nn.Module):
|
||||
head_size=self.head_dim,
|
||||
scale=self.scale,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
def forward(
|
||||
@@ -241,16 +235,12 @@ class Siglip2MLP(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.activation_fn = get_act_fn(config.hidden_act)
|
||||
use_data_parallel = (
|
||||
multimodal_config is not None
|
||||
and multimodal_config.mm_encoder_tp_mode == "data"
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.fc1 = ColumnParallelLinear(
|
||||
config.hidden_size,
|
||||
config.intermediate_size,
|
||||
@@ -282,7 +272,6 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -291,14 +280,12 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
self.self_attn = Siglip2Attention(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = Siglip2MLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -344,7 +331,6 @@ class Siglip2Encoder(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -354,7 +340,6 @@ class Siglip2Encoder(nn.Module):
|
||||
Siglip2EncoderLayer(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{idx}",
|
||||
)
|
||||
for idx in range(config.num_hidden_layers)
|
||||
@@ -383,7 +368,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -397,7 +381,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
self.encoder = Siglip2Encoder(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.encoder",
|
||||
)
|
||||
num_hidden_layers = config.num_hidden_layers
|
||||
@@ -438,7 +421,6 @@ class Siglip2Model(torch.nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -446,7 +428,6 @@ class Siglip2Model(torch.nn.Module):
|
||||
self.vision_model = Siglip2VisionTransformer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.vision_model",
|
||||
)
|
||||
|
||||
|
||||
@@ -600,7 +600,6 @@ class Lfm2VLForConditionalGeneration(
|
||||
self.vision_tower = Siglip2Model(
|
||||
config=vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
else:
|
||||
|
||||
@@ -166,7 +166,6 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -19,7 +19,7 @@ from transformers.models.llava import LlavaProcessor
|
||||
from transformers.models.pixtral import PixtralProcessor
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@@ -456,7 +456,6 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
|
||||
def init_vision_tower_for_llava(
|
||||
hf_config: LlavaLikeConfig,
|
||||
quant_config: QuantizationConfig | None,
|
||||
multimodal_config: MultiModalConfig | None,
|
||||
*,
|
||||
require_post_norm: bool | None = None,
|
||||
prefix: str = "",
|
||||
@@ -470,7 +469,6 @@ def init_vision_tower_for_llava(
|
||||
return CLIPVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -479,7 +477,6 @@ def init_vision_tower_for_llava(
|
||||
return SiglipVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -488,7 +485,6 @@ def init_vision_tower_for_llava(
|
||||
return PixtralHFVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -562,7 +558,6 @@ class LlavaForConditionalGeneration(
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -272,7 +272,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -332,7 +332,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -513,7 +513,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -205,7 +205,6 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -16,7 +16,7 @@ from transformers import (
|
||||
from transformers.models.pixtral import PixtralProcessor
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
@@ -382,7 +382,6 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
|
||||
def init_vision_tower_for_llava(
|
||||
hf_config: LlavaLikeConfig,
|
||||
quant_config: QuantizationConfig | None,
|
||||
multimodal_config: MultiModalConfig | None,
|
||||
*,
|
||||
require_post_norm: bool | None = None,
|
||||
prefix: str = "",
|
||||
@@ -397,7 +396,6 @@ def init_vision_tower_for_llava(
|
||||
return PixtralHFVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -461,7 +459,6 @@ class Mistral3ForConditionalGeneration(
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -52,7 +52,6 @@ import torch.nn.functional as F
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.distributed import divide, get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
|
||||
from vllm.model_executor.layers.conv import Conv2dLayer
|
||||
@@ -62,6 +61,7 @@ from vllm.model_executor.layers.linear import (
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.models.utils import maybe_prefix
|
||||
from vllm.model_executor.models.vision import is_vit_use_data_parallel
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
|
||||
@@ -308,11 +308,10 @@ class MLP2(nn.Module):
|
||||
activation,
|
||||
bias: bool = True,
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
assert len(dims) == 3
|
||||
self.use_data_parallel = use_data_parallel
|
||||
self.use_data_parallel = is_vit_use_data_parallel()
|
||||
self.fc0 = ColumnParallelLinear(
|
||||
dims[0],
|
||||
dims[1],
|
||||
@@ -343,17 +342,12 @@ class MoonVitEncoderLayer(nn.Module):
|
||||
hidden_dim: int,
|
||||
mlp_dim: int,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
activation=F.gelu,
|
||||
attn_bias: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
self.use_data_parallel = is_vit_use_data_parallel()
|
||||
|
||||
self.num_heads = num_heads
|
||||
self.hidden_dim = hidden_dim
|
||||
@@ -369,7 +363,6 @@ class MoonVitEncoderLayer(nn.Module):
|
||||
[hidden_dim, mlp_dim, hidden_dim],
|
||||
activation,
|
||||
prefix=f"{prefix}.mlp",
|
||||
use_data_parallel=self.use_data_parallel,
|
||||
)
|
||||
self.wqkv = QKVParallelLinear(
|
||||
hidden_size=hidden_dim,
|
||||
@@ -391,7 +384,6 @@ class MoonVitEncoderLayer(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
|
||||
@@ -469,7 +461,6 @@ class MoonVitEncoder(nn.Module):
|
||||
num_layers: int,
|
||||
block_cfg: dict,
|
||||
prefix: str = "",
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
@@ -479,7 +470,6 @@ class MoonVitEncoder(nn.Module):
|
||||
self.blocks = nn.ModuleList(
|
||||
[
|
||||
MoonVitEncoderLayer(
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
**block_cfg,
|
||||
)
|
||||
@@ -550,7 +540,6 @@ class MoonVitPretrainedModel(PreTrainedModel):
|
||||
def __init__(
|
||||
self,
|
||||
config: MoonViTConfig,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
*inputs,
|
||||
**kwargs,
|
||||
@@ -579,7 +568,6 @@ class MoonVitPretrainedModel(PreTrainedModel):
|
||||
"attn_bias": True,
|
||||
},
|
||||
prefix=f"{prefix}.encoder",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
def forward(
|
||||
|
||||
@@ -244,7 +244,6 @@ class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
||||
vision_config=config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=self.quant_config,
|
||||
multimodal_config=self.multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ import torch
|
||||
import torch.nn as nn
|
||||
from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@@ -103,7 +103,6 @@ class VisualTokenizer(torch.nn.Module):
|
||||
config: PretrainedConfig,
|
||||
visual_vocab_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -111,7 +110,6 @@ class VisualTokenizer(torch.nn.Module):
|
||||
self.vit = self._init_backbone(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.vit",
|
||||
)
|
||||
# reserved tokens for INDICATOR_IDS
|
||||
@@ -130,7 +128,6 @@ class VisualTokenizer(torch.nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
model_type = config.model_type
|
||||
@@ -138,7 +135,6 @@ class VisualTokenizer(torch.nn.Module):
|
||||
return Siglip2NavitModel(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=prefix,
|
||||
)
|
||||
raise ValueError(f"Unsupported visual tokenizer model_type: {model_type}")
|
||||
@@ -464,7 +460,6 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
self.config: PretrainedConfig = config
|
||||
|
||||
@@ -478,7 +473,6 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self.visual_tokenizer = VisualTokenizer(
|
||||
config=config.vit_config,
|
||||
visual_vocab_size=config.visual_vocab_size,
|
||||
multimodal_config=multimodal_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.visual_tokenizer",
|
||||
)
|
||||
|
||||
@@ -30,7 +30,7 @@ from transformers.modeling_outputs import (
|
||||
)
|
||||
from transformers.utils import torch_int
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -532,7 +532,6 @@ class SiglipAttention(nn.Module):
|
||||
num_heads: int,
|
||||
projection_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -565,7 +564,6 @@ class SiglipAttention(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(
|
||||
@@ -662,7 +660,6 @@ class SiglipEncoderLayer(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -673,14 +670,12 @@ class SiglipEncoderLayer(nn.Module):
|
||||
num_heads=config.num_attention_heads,
|
||||
projection_size=config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = SiglipMLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -718,7 +713,6 @@ class SiglipEncoder(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -727,13 +721,9 @@ class SiglipEncoder(nn.Module):
|
||||
num_heads = config.num_attention_heads
|
||||
head_dim = embed_dim // num_heads
|
||||
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend if multimodal_config else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
if self.attn_backend not in {
|
||||
AttentionBackendEnum.FLASH_ATTN,
|
||||
@@ -748,7 +738,6 @@ class SiglipEncoder(nn.Module):
|
||||
SiglipEncoderLayer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(config.num_hidden_layers)
|
||||
@@ -830,7 +819,6 @@ class SiglipVisionTransformer(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -841,7 +829,6 @@ class SiglipVisionTransformer(nn.Module):
|
||||
self.encoder = SiglipEncoder(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.encoder",
|
||||
)
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
@@ -880,7 +867,6 @@ class SiglipVisionModel(nn.Module):
|
||||
self,
|
||||
config,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -888,7 +874,6 @@ class SiglipVisionModel(nn.Module):
|
||||
self.vision_model = SiglipVisionTransformer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.vision_model",
|
||||
)
|
||||
self.quant_config = quant_config
|
||||
@@ -1010,16 +995,13 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
self.config = config
|
||||
self.multimodal_config = multimodal_config
|
||||
|
||||
with self._mark_tower_model(vllm_config, "image"):
|
||||
self.visual = SiglipVisionModel(
|
||||
config=config.vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
self.mlp_AR = Projector(config, config.vision_config)
|
||||
|
||||
@@ -29,7 +29,7 @@ from transformers import (
|
||||
)
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||
@@ -96,7 +96,6 @@ CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
|
||||
def _init_img_processor(
|
||||
hf_config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None,
|
||||
multimodal_config: MultiModalConfig | None,
|
||||
prefix: str = "",
|
||||
) -> CLIPVisionModel:
|
||||
clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
|
||||
@@ -111,7 +110,6 @@ def _init_img_processor(
|
||||
img_processor = CLIPVisionModel(
|
||||
clip_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
prefix=prefix,
|
||||
)
|
||||
@@ -170,7 +168,6 @@ class Phi3HDImageEmbedding(nn.Module):
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
quant_config: QuantizationConfig | None,
|
||||
multimodal_config: MultiModalConfig | None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -181,7 +178,6 @@ class Phi3HDImageEmbedding(nn.Module):
|
||||
self.img_processor = _init_img_processor(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.img_processor",
|
||||
)
|
||||
|
||||
@@ -596,7 +592,6 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
|
||||
self.vision_embed_tokens = Phi3HDImageEmbedding(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "model.vision_embed_tokens"),
|
||||
)
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ from transformers.models.pixtral.modeling_pixtral import (
|
||||
from transformers.tokenization_utils_base import TextInput
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import divide, get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.activation import get_act_and_mul_fn
|
||||
from vllm.model_executor.layers.conv import Conv2dLayer
|
||||
@@ -74,6 +74,7 @@ from .utils import init_vllm_registered_model, maybe_prefix
|
||||
from .vision import (
|
||||
VisionEncoderInfo,
|
||||
VisionFeatureSelectStrategy,
|
||||
is_vit_use_data_parallel,
|
||||
resolve_visual_encoder_outputs,
|
||||
)
|
||||
|
||||
@@ -1065,17 +1066,12 @@ class PixtralHFMLP(nn.Module):
|
||||
self,
|
||||
config: PixtralVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
|
||||
assert config.intermediate_size is not None
|
||||
self.gate_up_proj = MergedColumnParallelLinear(
|
||||
@@ -1108,7 +1104,6 @@ class PixtralHFAttention(nn.Module):
|
||||
self,
|
||||
config: PixtralVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
@@ -1120,11 +1115,7 @@ class PixtralHFAttention(nn.Module):
|
||||
self.head_dim = config.hidden_size // config.num_attention_heads
|
||||
assert self.total_num_heads * self.head_dim == config.hidden_size
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
hidden_size=config.hidden_size,
|
||||
head_size=self.head_dim,
|
||||
@@ -1189,7 +1180,6 @@ class PixtralHFTransformerBlock(nn.Module):
|
||||
self,
|
||||
config: PixtralVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
@@ -1199,13 +1189,11 @@ class PixtralHFTransformerBlock(nn.Module):
|
||||
self.attention = PixtralHFAttention(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attention",
|
||||
)
|
||||
self.feed_forward = PixtralHFMLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.feed_forward",
|
||||
)
|
||||
self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5)
|
||||
@@ -1232,7 +1220,6 @@ class PixtralHFTransformer(nn.Module):
|
||||
self,
|
||||
config: PixtralVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
prefix: str = "",
|
||||
@@ -1249,7 +1236,6 @@ class PixtralHFTransformer(nn.Module):
|
||||
PixtralHFTransformerBlock(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(num_hidden_layers)
|
||||
@@ -1281,7 +1267,6 @@ class PixtralHFVisionModel(nn.Module):
|
||||
self,
|
||||
config: PixtralVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -1302,7 +1287,6 @@ class PixtralHFVisionModel(nn.Module):
|
||||
self.transformer = PixtralHFTransformer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_override,
|
||||
prefix=f"{prefix}.transformer",
|
||||
)
|
||||
|
||||
@@ -846,7 +846,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
|
||||
norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
with self._mark_language_model(vllm_config):
|
||||
|
||||
@@ -43,7 +43,7 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
|
||||
)
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
from vllm.forward_context import set_forward_context
|
||||
@@ -109,6 +109,7 @@ from .utils import (
|
||||
)
|
||||
from .vision import (
|
||||
get_vit_attn_backend,
|
||||
is_vit_use_data_parallel,
|
||||
run_dp_sharded_mrope_vision_model,
|
||||
)
|
||||
|
||||
@@ -266,15 +267,10 @@ class Qwen2_5_VisionMLP(nn.Module):
|
||||
bias: bool = False,
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.gate_up_proj = MergedColumnParallelLinear(
|
||||
input_size=in_features,
|
||||
output_sizes=[hidden_features] * 2, # [gate_proj, up_proj]
|
||||
@@ -308,16 +304,11 @@ class Qwen2_5_VisionAttention(nn.Module):
|
||||
num_heads: int,
|
||||
projection_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
# Per attention head and per partition values.
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.tp_size = (
|
||||
1
|
||||
if use_data_parallel
|
||||
@@ -354,7 +345,6 @@ class Qwen2_5_VisionAttention(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)
|
||||
@@ -435,7 +425,6 @@ class Qwen2_5_VisionBlock(nn.Module):
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -448,7 +437,6 @@ class Qwen2_5_VisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.mlp = Qwen2_5_VisionMLP(
|
||||
@@ -457,7 +445,6 @@ class Qwen2_5_VisionBlock(nn.Module):
|
||||
act_fn=act_fn,
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -530,15 +517,10 @@ class Qwen2_5_VisionPatchMerger(nn.Module):
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
spatial_merge_size: int = 2,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.hidden_size = context_dim * (spatial_merge_size**2)
|
||||
if norm_layer is None:
|
||||
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
||||
@@ -579,7 +561,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
vision_config: Qwen2_5_VLVisionConfig,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -620,15 +601,9 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
rope_parameters={"partial_rotary_factor": 0.5},
|
||||
)
|
||||
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend
|
||||
if multimodal_config is not None
|
||||
else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
if self.attn_backend not in {
|
||||
@@ -650,7 +625,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
act_fn=get_act_and_mul_fn(vision_config.hidden_act),
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(depth)
|
||||
@@ -664,7 +638,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
norm_layer=norm_layer,
|
||||
spatial_merge_size=self.spatial_merge_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.merger",
|
||||
)
|
||||
|
||||
@@ -1152,7 +1125,6 @@ class Qwen2_5_VLForConditionalGeneration(
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=self.quant_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
with self._mark_language_model(vllm_config):
|
||||
|
||||
@@ -43,7 +43,7 @@ from transformers.models.qwen2_vl.configuration_qwen2_vl import (
|
||||
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
||||
from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
|
||||
from vllm.distributed import utils as dist_utils
|
||||
@@ -106,6 +106,7 @@ from .utils import (
|
||||
)
|
||||
from .vision import (
|
||||
get_vit_attn_backend,
|
||||
is_vit_use_data_parallel,
|
||||
run_dp_sharded_mrope_vision_model,
|
||||
)
|
||||
|
||||
@@ -247,15 +248,10 @@ class Qwen2VisionMLP(nn.Module):
|
||||
hidden_features: int,
|
||||
act_layer: type[nn.Module] = QuickGELU,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.fc1 = ColumnParallelLinear(
|
||||
in_features,
|
||||
hidden_features,
|
||||
@@ -286,16 +282,11 @@ class Qwen2VisionAttention(nn.Module):
|
||||
num_heads: int,
|
||||
projection_size: int,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
# Per attention head and per partition values.
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.tp_size = (
|
||||
1
|
||||
if use_data_parallel
|
||||
@@ -328,7 +319,6 @@ class Qwen2VisionAttention(nn.Module):
|
||||
num_heads=self.num_attention_heads_per_partition,
|
||||
head_size=self.hidden_size_per_attention_head,
|
||||
scale=self.hidden_size_per_attention_head**-0.5,
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True)
|
||||
@@ -409,7 +399,6 @@ class Qwen2VisionBlock(nn.Module):
|
||||
act_layer: type[nn.Module] = QuickGELU,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -424,7 +413,6 @@ class Qwen2VisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.mlp = Qwen2VisionMLP(
|
||||
@@ -432,7 +420,6 @@ class Qwen2VisionBlock(nn.Module):
|
||||
mlp_hidden_dim,
|
||||
act_layer=act_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -493,15 +480,10 @@ class Qwen2VisionPatchMerger(nn.Module):
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
spatial_merge_size: int = 2,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.hidden_size = context_dim * (spatial_merge_size**2)
|
||||
if norm_layer is None:
|
||||
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
||||
@@ -545,7 +527,6 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
vision_config: Qwen2VLVisionConfig,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -560,11 +541,7 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
num_heads = vision_config.num_heads
|
||||
mlp_ratio = vision_config.mlp_ratio
|
||||
|
||||
self.use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
self.use_data_parallel = is_vit_use_data_parallel()
|
||||
self.out_hidden_size = vision_config.hidden_size
|
||||
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
@@ -596,7 +573,6 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
for layer_idx in range(depth)
|
||||
]
|
||||
@@ -607,15 +583,10 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.merger",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend if multimodal_config else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -1238,7 +1209,6 @@ class Qwen2VLForConditionalGeneration(
|
||||
config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
# isort: on
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
|
||||
@@ -160,7 +160,6 @@ class Qwen3OmniMoeAudioAttention(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: Qwen3OmniMoeAudioEncoderConfig,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -198,7 +197,6 @@ class Qwen3OmniMoeAudioAttention(nn.Module):
|
||||
num_heads=self.num_local_heads,
|
||||
head_size=self.head_dim,
|
||||
scale=self.scaling,
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
def forward(
|
||||
@@ -233,13 +231,12 @@ class Qwen3OmniMoeAudioEncoderLayer(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: Qwen3OmniMoeAudioEncoderConfig,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
self.self_attn = Qwen3OmniMoeAudioAttention(
|
||||
config, multimodal_config=multimodal_config, prefix=f"{prefix}.self_attn"
|
||||
config, prefix=f"{prefix}.self_attn"
|
||||
)
|
||||
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
self.activation_fn = _ACTIVATION_REGISTRY[config.activation_function]
|
||||
@@ -301,7 +298,6 @@ class Qwen3OmniMoeAudioEncoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: Qwen3OmniMoeAudioEncoderConfig,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -345,7 +341,6 @@ class Qwen3OmniMoeAudioEncoder(nn.Module):
|
||||
[
|
||||
Qwen3OmniMoeAudioEncoderLayer(
|
||||
config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{i}",
|
||||
)
|
||||
for i in range(config.encoder_layers)
|
||||
@@ -359,15 +354,9 @@ class Qwen3OmniMoeAudioEncoder(nn.Module):
|
||||
self.proj2 = nn.Linear(config.d_model, config.output_dim)
|
||||
|
||||
# Get attention backend
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend
|
||||
if multimodal_config is not None
|
||||
else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=config.d_model // config.encoder_attention_heads,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> torch.Tensor | None:
|
||||
@@ -601,7 +590,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
mlp_hidden_dim: int,
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
@@ -615,7 +603,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.mlp = Qwen3_VisionMLP(
|
||||
@@ -710,7 +697,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
|
||||
vision_config,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -758,7 +744,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
|
||||
act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(vision_config.depth)
|
||||
@@ -788,16 +773,9 @@ class Qwen3Omni_VisionTransformer(nn.Module):
|
||||
]
|
||||
)
|
||||
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend
|
||||
if multimodal_config is not None
|
||||
else None
|
||||
)
|
||||
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -1617,7 +1595,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
||||
with self._mark_tower_model(vllm_config, "audio"):
|
||||
self.audio_tower = Qwen3OmniMoeAudioEncoder(
|
||||
thinker_config.audio_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "audio_tower"),
|
||||
)
|
||||
|
||||
@@ -1638,7 +1615,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
||||
norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
# register buffer for deepstack
|
||||
|
||||
@@ -49,7 +49,7 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
||||
from vllm.distributed import get_pp_group
|
||||
from vllm.logger import init_logger
|
||||
@@ -123,6 +123,7 @@ from .utils import (
|
||||
)
|
||||
from .vision import (
|
||||
get_vit_attn_backend,
|
||||
is_vit_use_data_parallel,
|
||||
run_dp_sharded_mrope_vision_model,
|
||||
)
|
||||
|
||||
@@ -169,15 +170,10 @@ class Qwen3_VisionMLP(nn.Module):
|
||||
bias: bool = False,
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.linear_fc1 = ColumnParallelLinear(
|
||||
in_features,
|
||||
hidden_features,
|
||||
@@ -211,7 +207,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
mlp_hidden_dim: int,
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
@@ -225,7 +220,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.mlp = Qwen3_VisionMLP(
|
||||
@@ -234,7 +228,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
act_fn=act_fn,
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -267,15 +260,10 @@ class Qwen3_VisionPatchMerger(nn.Module):
|
||||
spatial_merge_size: int = 2,
|
||||
use_postshuffle_norm: bool = False,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.hidden_size = context_dim * (spatial_merge_size**2)
|
||||
|
||||
self.use_postshuffle_norm = use_postshuffle_norm
|
||||
@@ -321,7 +309,6 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
vision_config: Qwen3VLVisionConfig,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -365,7 +352,6 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
norm_layer=norm_layer,
|
||||
spatial_merge_size=self.spatial_merge_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.merger",
|
||||
)
|
||||
|
||||
@@ -378,20 +364,15 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
use_postshuffle_norm=True,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(len(self.deepstack_visual_indexes))
|
||||
]
|
||||
)
|
||||
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend if multimodal_config else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
if self.attn_backend not in {
|
||||
@@ -411,7 +392,6 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(vision_config.depth)
|
||||
@@ -1291,7 +1271,6 @@ class Qwen3VLForConditionalGeneration(
|
||||
config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
|
||||
|
||||
@@ -446,7 +446,6 @@ class Qwen3VLMoeForConditionalGeneration(
|
||||
config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ from transformers import (
|
||||
)
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import divide, get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.attention.encoder_only_attention import (
|
||||
@@ -64,6 +64,7 @@ from .vision import (
|
||||
VisionFeatureSelectStrategy,
|
||||
VisionFeatureSelectStrategyStr,
|
||||
get_num_selected_vision_tokens,
|
||||
is_vit_use_data_parallel,
|
||||
resolve_visual_encoder_outputs,
|
||||
)
|
||||
|
||||
@@ -356,7 +357,6 @@ class SiglipAttention(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig | SiglipTextConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention],
|
||||
@@ -376,11 +376,7 @@ class SiglipAttention(nn.Module):
|
||||
|
||||
self.scale = self.head_dim**-0.5
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
hidden_size=self.embed_dim,
|
||||
head_size=self.head_dim,
|
||||
@@ -409,7 +405,6 @@ class SiglipAttention(nn.Module):
|
||||
self.head_dim,
|
||||
self.scale,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
else:
|
||||
self.attn = attn_cls(
|
||||
@@ -437,17 +432,12 @@ class SiglipMLP(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig | SiglipTextConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.activation_fn = get_act_fn(config.hidden_act)
|
||||
|
||||
# Special handling for BNB and torchao quantization
|
||||
@@ -487,7 +477,6 @@ class SiglipEncoderLayer(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig | SiglipTextConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention],
|
||||
@@ -499,7 +488,6 @@ class SiglipEncoderLayer(nn.Module):
|
||||
self.self_attn = SiglipAttention(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
attn_cls=attn_cls,
|
||||
)
|
||||
@@ -507,7 +495,6 @@ class SiglipEncoderLayer(nn.Module):
|
||||
self.mlp = SiglipMLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
@@ -535,7 +522,6 @@ class SiglipEncoder(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig | SiglipTextConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
*,
|
||||
prefix: str = "",
|
||||
@@ -555,7 +541,6 @@ class SiglipEncoder(nn.Module):
|
||||
SiglipEncoderLayer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{layer_idx}",
|
||||
attn_cls=attn_cls,
|
||||
)
|
||||
@@ -660,7 +645,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -674,7 +658,6 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
|
||||
self.mlp = SiglipMLP(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -700,7 +683,6 @@ class SiglipVisionTransformer(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -717,7 +699,6 @@ class SiglipVisionTransformer(nn.Module):
|
||||
self.encoder = SiglipEncoder(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_override,
|
||||
prefix=f"{prefix}.encoder",
|
||||
attn_cls=MMEncoderAttention,
|
||||
@@ -756,7 +737,6 @@ class SiglipVisionTransformer(nn.Module):
|
||||
SiglipMultiheadAttentionPoolingHead(
|
||||
config=config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.head",
|
||||
)
|
||||
if self.use_head
|
||||
@@ -870,7 +850,6 @@ class SiglipVisionModel(nn.Module):
|
||||
self,
|
||||
config: SiglipVisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
*,
|
||||
num_hidden_layers_override: int | None = None,
|
||||
require_post_norm: bool | None = None,
|
||||
@@ -883,7 +862,6 @@ class SiglipVisionModel(nn.Module):
|
||||
self.vision_model = SiglipVisionTransformer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_override,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=f"{prefix}.vision_model",
|
||||
@@ -1062,9 +1040,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
|
||||
|
||||
config: SiglipConfig = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
self.config = config
|
||||
self.multimodal_config = multimodal_config
|
||||
|
||||
if hasattr(config, "num_labels"):
|
||||
config.num_labels = 0
|
||||
@@ -1087,7 +1063,6 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
|
||||
self.vision_model = SiglipVisionTransformer(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "vision_model"),
|
||||
use_head=None, # Allows potential pooling head
|
||||
)
|
||||
|
||||
@@ -11,7 +11,6 @@ from torch.nn import functional as F
|
||||
from transformers import Siglip2VisionConfig
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.distributed import divide, get_tensor_model_parallel_world_size
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
|
||||
@@ -30,6 +29,8 @@ from vllm.model_executor.layers.rotary_embedding.common import (
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .vision import is_vit_use_data_parallel
|
||||
|
||||
|
||||
class VisionRotaryEmbedding(nn.Module):
|
||||
def __init__(self, dim: int, theta: float = 10000.0) -> None:
|
||||
@@ -178,9 +179,7 @@ class Siglip2Attention(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
use_data_parallel: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@@ -196,11 +195,7 @@ class Siglip2Attention(nn.Module):
|
||||
self.scale = self.head_dim**-0.5
|
||||
self.dropout = config.attention_dropout
|
||||
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
hidden_size=self.embed_dim,
|
||||
head_size=self.head_dim,
|
||||
@@ -228,7 +223,6 @@ class Siglip2Attention(nn.Module):
|
||||
head_size=self.head_dim,
|
||||
scale=self.scale,
|
||||
prefix=f"{prefix}.attn",
|
||||
multimodal_config=multimodal_config,
|
||||
)
|
||||
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(
|
||||
@@ -287,16 +281,11 @@ class Siglip2MLP(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.activation_fn = get_act_fn(config.hidden_act)
|
||||
self.fc1 = ColumnParallelLinear(
|
||||
config.hidden_size,
|
||||
@@ -325,7 +314,6 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -334,14 +322,12 @@ class Siglip2EncoderLayer(nn.Module):
|
||||
self.self_attn = Siglip2Attention(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = Siglip2MLP(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -387,7 +373,6 @@ class Siglip2Encoder(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -397,7 +382,6 @@ class Siglip2Encoder(nn.Module):
|
||||
Siglip2EncoderLayer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.layers.{idx}",
|
||||
)
|
||||
for idx in range(config.num_hidden_layers)
|
||||
@@ -571,7 +555,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -582,7 +565,6 @@ class Siglip2VisionTransformer(nn.Module):
|
||||
self.encoder = Siglip2Encoder(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.encoder",
|
||||
)
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
@@ -610,7 +592,6 @@ class Siglip2NavitModel(torch.nn.Module):
|
||||
self,
|
||||
config: Siglip2VisionConfig,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -618,7 +599,6 @@ class Siglip2NavitModel(torch.nn.Module):
|
||||
self.vision_model = Siglip2VisionTransformer(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.vision_model",
|
||||
)
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ from transformers.models.llava import LlavaProcessor
|
||||
from transformers.processing_utils import ProcessingKwargs, Unpack
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@@ -351,7 +351,6 @@ def _build_tarsier_hf_processor(
|
||||
def init_vision_tower_for_tarsier(
|
||||
hf_config: TarsierHfConfig, # Use the Tarsier specific config protocol
|
||||
quant_config: QuantizationConfig | None,
|
||||
multimodal_config: MultiModalConfig | None,
|
||||
*,
|
||||
require_post_norm: bool | None = None,
|
||||
prefix: str = "",
|
||||
@@ -378,7 +377,6 @@ def init_vision_tower_for_tarsier(
|
||||
return CLIPVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_to_init,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -387,7 +385,6 @@ def init_vision_tower_for_tarsier(
|
||||
return SiglipVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers_to_init,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -420,7 +417,6 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
|
||||
|
||||
config: TarsierHfConfig = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
multimodal_config = vllm_config.model_config.multimodal_config
|
||||
|
||||
self.config = config # Storing the Tarsier-specific HF config
|
||||
|
||||
@@ -428,7 +424,6 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
|
||||
self.vision_tower = init_vision_tower_for_tarsier(
|
||||
config,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
@@ -10,7 +10,7 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
|
||||
import torch
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config import MultiModalConfig, VllmConfig, get_current_vllm_config
|
||||
from vllm.distributed import (
|
||||
get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
@@ -79,7 +79,7 @@ def get_vision_encoder_info(hf_config: VisionLanguageConfig) -> VisionEncoderInf
|
||||
raise NotImplementedError(msg)
|
||||
|
||||
|
||||
def get_vit_attn_backend(
|
||||
def _get_vit_attn_backend(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
*,
|
||||
@@ -95,6 +95,52 @@ def get_vit_attn_backend(
|
||||
)
|
||||
|
||||
|
||||
def get_vit_attn_backend(
    head_size: int,
    dtype: torch.dtype,
) -> AttentionBackendEnum:
    """
    Select the attention backend for Vision Transformer layers.

    The backend override is not passed in by the caller; it is read from
    the multimodal config attached to the current vLLM config.

    Args:
        head_size: forwarded unchanged to `_get_vit_attn_backend`.
        dtype: forwarded unchanged to `_get_vit_attn_backend`.

    Returns:
        The backend returned by `_get_vit_attn_backend`.
    """
    try:
        mm_config = get_current_vllm_config().model_config.multimodal_config
    except AssertionError:
        # NOTE(review): presumably raised when no vLLM config is currently
        # set -- confirm against `get_current_vllm_config`. In that case we
        # proceed without any override.
        mm_config = None

    # Only a present multimodal config can supply an override.
    backend_override = None
    if mm_config is not None:
        backend_override = mm_config.mm_encoder_attn_backend

    return _get_vit_attn_backend(
        head_size,
        dtype,
        attn_backend_override=backend_override,
    )
|
||||
|
||||
|
||||
def is_vit_use_data_parallel() -> bool:
    """
    Report whether the Vision Transformer should use data parallelism.

    Returns:
        True when the multimodal config of the current vLLM config has
        `mm_encoder_tp_mode` equal to "data". False otherwise, including
        when no multimodal config is available.
    """
    try:
        mm_config = get_current_vllm_config().model_config.multimodal_config
    except AssertionError:
        # NOTE(review): presumably raised when no vLLM config is currently
        # set -- confirm against `get_current_vllm_config`.
        return False

    if mm_config is None:
        return False
    return mm_config.mm_encoder_tp_mode == "data"
|
||||
|
||||
|
||||
def should_torch_compile_mm_vit(vllm_config: VllmConfig) -> bool:
    """Callable to be passed to `@support_torch_compile`'s `enable_if` argument.

    Args:
        vllm_config: config whose `compilation_config` is inspected.

    Returns:
        The `compile_mm_encoder` value of `vllm_config.compilation_config`.
    """
    compilation_config = vllm_config.compilation_config
    return compilation_config.compile_mm_encoder
|
||||
|
||||
Reference in New Issue
Block a user