[Model] Use merge_by_field_config for MM models (Qwen series) (#27546)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-27 13:38:05 +08:00
committed by GitHub
parent 63b22e0dbb
commit cbd5e07a51
7 changed files with 36 additions and 305 deletions

View File

@@ -1071,6 +1071,8 @@ class Qwen2_5_VLForConditionalGeneration(
SupportsMultiModalPruning,
SupportsMRoPE,
):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
@@ -1273,24 +1275,6 @@ class Qwen2_5_VLForConditionalGeneration(
num_layers = len(self.language_model.model.layers)
return (2, num_layers // 2, num_layers - 3)
# Normalize one multimodal input into a single tensor:
#   - a 2D tensor is returned unchanged,
#   - a 3D tensor is reshaped to 2D, keeping its last dimension,
#   - a list is passed to torch.concat.
# Parameters:
#   mm_input: the value to normalize; must be a torch.Tensor or a list.
#   name: label for the input, used only inside the error messages.
# Returns: the resulting torch.Tensor.
# Raises: ValueError if mm_input is neither a tensor nor a list, or if it is
#   a tensor whose ndim is neither 2 nor 3.
# NOTE(review): indentation is missing in this view, so the nesting described
# in the comments below is inferred from statement order — confirm against
# the real file.
def _validate_and_reshape_mm_tensor(
self, mm_input: object, name: str
) -> torch.Tensor:
# Reject anything that is neither a tensor nor a list up front.
if not isinstance(mm_input, (torch.Tensor, list)):
raise ValueError(f"Incorrect type of {name}. Got type: {type(mm_input)}")
if isinstance(mm_input, torch.Tensor):
# Already 2D: nothing to reshape.
if mm_input.ndim == 2:
return mm_input
# Only 3D is accepted beyond this point.
if mm_input.ndim != 3:
raise ValueError(
f"{name} should be 2D or batched 3D tensor. "
f"Got ndim: {mm_input.ndim} "
f"(shape={mm_input.shape})"
)
# 3D: merge the two leading dimensions into one, keeping the last dimension.
return mm_input.reshape(-1, mm_input.shape[-1])
else:
# List input: joined by torch.concat using its default dimension.
# The list elements are not validated here — presumably tensors with
# compatible shapes; verify against callers.
return torch.concat(mm_input)
def _parse_and_validate_image_input(
self, **kwargs: object
) -> Qwen2_5_VLImageInputs | None:
@@ -1302,13 +1286,6 @@ class Qwen2_5_VLForConditionalGeneration(
return None
if pixel_values is not None:
pixel_values = self._validate_and_reshape_mm_tensor(
pixel_values, "image pixel values"
)
image_grid_thw = self._validate_and_reshape_mm_tensor(
image_grid_thw, "image grid_thw"
)
return Qwen2_5_VLImagePixelInputs(
type="pixel_values",
pixel_values=pixel_values,
@@ -1316,13 +1293,6 @@ class Qwen2_5_VLForConditionalGeneration(
)
if image_embeds is not None:
image_embeds = self._validate_and_reshape_mm_tensor(
image_embeds, "image embeds"
)
image_grid_thw = self._validate_and_reshape_mm_tensor(
image_grid_thw, "image grid_thw"
)
return Qwen2_5_VLImageEmbeddingInputs(
type="image_embeds",
image_embeds=image_embeds,
@@ -1341,14 +1311,6 @@ class Qwen2_5_VLForConditionalGeneration(
return None
if pixel_values_videos is not None:
pixel_values_videos = self._validate_and_reshape_mm_tensor(
pixel_values_videos, "video pixel values"
)
video_grid_thw = self._validate_and_reshape_mm_tensor(
video_grid_thw, "video grid_thw"
)
if second_per_grid_ts is not None and second_per_grid_ts.ndim == 2:
second_per_grid_ts = second_per_grid_ts.squeeze(-1)
return Qwen2_5_VLVideoPixelInputs(
type="pixel_values_videos",
pixel_values_videos=pixel_values_videos,
@@ -1357,13 +1319,6 @@ class Qwen2_5_VLForConditionalGeneration(
)
if video_embeds is not None:
video_embeds = self._validate_and_reshape_mm_tensor(
video_embeds, "video embeds"
)
video_grid_thw = self._validate_and_reshape_mm_tensor(
video_grid_thw, "video grid_thw"
)
return Qwen2_5_VLVideoEmbeddingInputs(
type="video_embeds",
video_embeds=video_embeds,