[Bugfix] Disable the post_norm layer of the vision encoder for LLaVA models (#9653)

This commit is contained in:
litianjian
2024-10-24 22:52:07 +08:00
committed by GitHub
parent b979143d5b
commit f58454968f
4 changed files with 8 additions and 4 deletions

View File

@@ -256,7 +256,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
self.multimodal_config = multimodal_config
# Initialize the vision tower only up to the required feature layer
-self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+self.vision_tower = init_vision_tower_for_llava(
+config, quant_config, require_post_norm=False)
self.vision_resampler = LlavaNextVideoPooler(config)
self.multi_modal_projector = LlavaNextMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,