[Model] Standardize common vision encoders (#31947)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -16,7 +16,7 @@ from transformers import (
|
||||
from transformers.models.pixtral import PixtralProcessor
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
-from vllm.config.multimodal import BaseDummyOptions
|
||||
+from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
@@ -395,6 +395,7 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
|
||||
def init_vision_tower_for_llava(
|
||||
hf_config: LlavaLikeConfig,
|
||||
quant_config: QuantizationConfig | None,
|
||||
+multimodal_config: MultiModalConfig | None,
|
||||
*,
|
||||
require_post_norm: bool | None = None,
|
||||
prefix: str = "",
|
||||
@@ -409,6 +410,7 @@ def init_vision_tower_for_llava(
|
||||
return PixtralHFVisionModel(
|
||||
vision_config,
|
||||
quant_config=quant_config,
|
||||
+multimodal_config=multimodal_config,
|
||||
num_hidden_layers_override=num_hidden_layers,
|
||||
require_post_norm=require_post_norm,
|
||||
prefix=prefix,
|
||||
@@ -472,7 +474,8 @@ class Mistral3ForConditionalGeneration(
|
||||
if multimodal_config.get_limit_per_prompt("image"):
|
||||
self.vision_tower = init_vision_tower_for_llava(
|
||||
config,
|
||||
-quant_config,
|
||||
+quant_config=quant_config,
|
||||
+multimodal_config=multimodal_config,
|
||||
require_post_norm=False,
|
||||
prefix=maybe_prefix(prefix, "vision_tower"),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user