[Model] Standardize common vision encoders (#31947)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-08 18:33:16 +08:00
committed by GitHub
parent d1b6fe007f
commit 5576227bc1
19 changed files with 253 additions and 173 deletions

View File

@@ -16,7 +16,7 @@ from transformers import (
from transformers.models.pixtral import PixtralProcessor
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -395,6 +395,7 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None,
multimodal_config: MultiModalConfig | None,
*,
require_post_norm: bool | None = None,
prefix: str = "",
@@ -409,6 +410,7 @@ def init_vision_tower_for_llava(
return PixtralHFVisionModel(
vision_config,
quant_config=quant_config,
multimodal_config=multimodal_config,
num_hidden_layers_override=num_hidden_layers,
require_post_norm=require_post_norm,
prefix=prefix,
@@ -472,7 +474,8 @@ class Mistral3ForConditionalGeneration(
if multimodal_config.get_limit_per_prompt("image"):
self.vision_tower = init_vision_tower_for_llava(
config,
quant_config,
quant_config=quant_config,
multimodal_config=multimodal_config,
require_post_norm=False,
prefix=maybe_prefix(prefix, "vision_tower"),
)