[Feature] Add LoRA tower/connector support for Llama 4 Vision (mllama4) (#35147)

Signed-off-by: dorhuri123 <dor.huri1@live.biu.ac.il>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Author: Dor Huri
Date: 2026-02-24 14:10:32 +02:00
Committed by: GitHub
Parent: f1c664545b
Commit: 012dee9233


@@ -1151,6 +1151,28 @@ class Llama4ForConditionalGeneration(
"""
return MultiModelKeys.from_string_field(
language_model="language_model",
connector="multi_modal_projector.",
connector=[
"multi_modal_projector.",
"vision_model.vision_adapter.",
],
tower_model="vision_model.",
)
def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
vision_config = self.config.vision_config
patches_per_chunk = Mllama4ProcessingInfo.get_patch_per_chunk(vision_config)
if num_image_tokens <= 0 or patches_per_chunk <= 0:
return 0
raw_patches = (vision_config.image_size // vision_config.patch_size) ** 2
num_chunks = num_image_tokens // patches_per_chunk
# Encoder processes raw_patches + 1 (CLS) per chunk
return num_chunks * (raw_patches + 1)
def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
vision_config = self.config.vision_config
raw_patches = (vision_config.image_size // vision_config.patch_size) ** 2
if num_vision_tokens <= 0:
return 0
num_chunks = num_vision_tokens // (raw_patches + 1)
patches_per_chunk = Mllama4ProcessingInfo.get_patch_per_chunk(vision_config)
return num_chunks * patches_per_chunk
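
For context on the connector change: vLLM routes LoRA weights to the language model, the connector, or the vision tower by module-name prefix, so `vision_model.vision_adapter.` needs its own connector entry or it would be swallowed by the broader `vision_model.` tower prefix. A minimal sketch of that routing (a hypothetical classifier, not vLLM's actual `MultiModelKeys` logic; it assumes connector prefixes are checked before the tower prefix):

```python
# Hypothetical prefix router illustrating the mapping above; not vLLM code.
CONNECTOR_PREFIXES = ["multi_modal_projector.", "vision_model.vision_adapter."]
TOWER_PREFIX = "vision_model."

def classify(module_name: str) -> str:
    # Check connector prefixes first so the adapter is not treated as
    # part of the tower, which it also prefix-matches.
    if any(module_name.startswith(p) for p in CONNECTOR_PREFIXES):
        return "connector"
    if module_name.startswith(TOWER_PREFIX):
        return "tower_model"
    return "language_model"

assert classify("multi_modal_projector.linear_1") == "connector"
assert classify("vision_model.vision_adapter.mlp.fc1") == "connector"
assert classify("vision_model.model.layers.0.self_attn.q_proj") == "tower_model"
```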
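
The two new methods convert between token counts on either side of the connector: the encoder emits `raw_patches + 1` (patches plus CLS) tokens per image chunk, while the connector's pixel shuffle leaves `patches_per_chunk` tokens per chunk for the language model. A worked round trip under assumed, illustrative settings (image_size=336, patch_size=14, 4x pixel-shuffle downsample; the real values come from `vision_config`):

```python
# Assumed example values, not read from a real checkpoint.
raw_patches = (336 // 14) ** 2        # 576 raw patches per chunk
patches_per_chunk = raw_patches // 4  # 144 tokens per chunk after pixel shuffle

num_image_tokens = 2 * patches_per_chunk            # 288 LM-side tokens (2 chunks)

# get_num_mm_encoder_tokens: LM-side tokens -> encoder tokens
num_chunks = num_image_tokens // patches_per_chunk  # 2
encoder_tokens = num_chunks * (raw_patches + 1)     # 2 * 577 = 1154

# get_num_mm_connector_tokens: encoder tokens -> connector output tokens
chunks_back = encoder_tokens // (raw_patches + 1)   # 2
connector_tokens = chunks_back * patches_per_chunk  # 288

assert connector_tokens == num_image_tokens         # round trip is consistent
```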