[VLM] Clean up Phi-4-MM ViT implementation (#14812)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
Isotr0py
2025-03-16 09:53:52 +08:00
committed by GitHub
parent 3453b964a3
commit def232e122
7 changed files with 316 additions and 1988 deletions

View File

@@ -60,7 +60,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
-        super().__init__(config, quant_config, prefix)
+        super().__init__(config, quant_config=quant_config, prefix=prefix)
# Unlike Idefics3VisionTransformer which uses LayerNorm after the
# final layer, Aria omits this normalization, so we replace it with an
# Identity layer
@@ -512,7 +512,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
self.config = config
self.vision_tower = AriaVisionTransformer(
config.vision_config,
-            quant_config,
+            quant_config=quant_config,
prefix=f"{prefix}.vision_tower",
)
self.multi_modal_projector = AriaProjector(config)