[Models]: Make Multimodal config implicit in ViT implementation (#31972)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -49,7 +49,7 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import MultiModalConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
||||
from vllm.distributed import get_pp_group
|
||||
from vllm.logger import init_logger
|
||||
@@ -123,6 +123,7 @@ from .utils import (
|
||||
)
|
||||
from .vision import (
|
||||
get_vit_attn_backend,
|
||||
is_vit_use_data_parallel,
|
||||
run_dp_sharded_mrope_vision_model,
|
||||
)
|
||||
|
||||
@@ -169,15 +170,10 @@ class Qwen3_VisionMLP(nn.Module):
|
||||
bias: bool = False,
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.linear_fc1 = ColumnParallelLinear(
|
||||
in_features,
|
||||
hidden_features,
|
||||
@@ -211,7 +207,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
mlp_hidden_dim: int,
|
||||
act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
|
||||
norm_layer: Callable[[int], nn.Module] | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
@@ -225,7 +220,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
num_heads=num_heads,
|
||||
projection_size=dim,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
)
|
||||
self.mlp = Qwen3_VisionMLP(
|
||||
@@ -234,7 +228,6 @@ class Qwen3_VisionBlock(nn.Module):
|
||||
act_fn=act_fn,
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
|
||||
@@ -267,15 +260,10 @@ class Qwen3_VisionPatchMerger(nn.Module):
|
||||
spatial_merge_size: int = 2,
|
||||
use_postshuffle_norm: bool = False,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
use_data_parallel = (
|
||||
multimodal_config.mm_encoder_tp_mode == "data"
|
||||
if multimodal_config
|
||||
else False
|
||||
)
|
||||
use_data_parallel = is_vit_use_data_parallel()
|
||||
self.hidden_size = context_dim * (spatial_merge_size**2)
|
||||
|
||||
self.use_postshuffle_norm = use_postshuffle_norm
|
||||
@@ -321,7 +309,6 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
vision_config: Qwen3VLVisionConfig,
|
||||
norm_eps: float = 1e-6,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
multimodal_config: MultiModalConfig | None = None,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@@ -365,7 +352,6 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
norm_layer=norm_layer,
|
||||
spatial_merge_size=self.spatial_merge_size,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.merger",
|
||||
)
|
||||
|
||||
@@ -378,20 +364,15 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
use_postshuffle_norm=True,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(len(self.deepstack_visual_indexes))
|
||||
]
|
||||
)
|
||||
|
||||
attn_backend_override = (
|
||||
multimodal_config.mm_encoder_attn_backend if multimodal_config else None
|
||||
)
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
attn_backend_override=attn_backend_override,
|
||||
)
|
||||
|
||||
if self.attn_backend not in {
|
||||
@@ -411,7 +392,6 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
)
|
||||
for layer_idx in range(vision_config.depth)
|
||||
@@ -1291,7 +1271,6 @@ class Qwen3VLForConditionalGeneration(
|
||||
config.vision_config,
|
||||
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
|
||||
quant_config=quant_config,
|
||||
multimodal_config=multimodal_config,
|
||||
prefix=maybe_prefix(prefix, "visual"),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user